from bokeh.io import output_notebook


output_notebook()  # render plots inline


# import the relevant libraries
import pandas as pd


file = "../data/csv_files/titanic_all.csv"

# create new dataframe with only the relevant columns
titanic = pd.read_csv(file)
titanic = titanic.drop(["name", "class", "survived"], axis=1)

# create new dataframe for male and female passengers respectively
female = titanic[titanic["sex"] == "female"]
male = titanic[titanic["sex"] == "male"]

# get age data for the density plots.
f_values = female.age.dropna().values
m_values = male.age.dropna().values
t_values = titanic.age.dropna().values


from bokeh.plotting import figure, show
from sklearn.neighbors import KernelDensity
import numpy as np


# create function to plot the multiple density estimates
def plot_kde(data_dict, title, kernel="gaussian", bandwidth=2, line_color=None):
    """
    Create a density plot using Kernel Density Estimation (KDE) for multiple datasets.

    Parameters:
        data_dict (list of dicts): A list of dictionaries, where each dictionary contains the following keys:
                                   - 'data': The data to be plotted.
                                   - 'color': The color of the filled area.
                                   - 'legend_label': The legend label for the dataset.
        title (str): The title of the plot.
        kernel (str, optional): The type of kernel to use in creating the plot. Default is gaussian.
        bandwidth (float, optional): The bandwidth of the KDE. Higher values result in smoother
                                     but less accurate density plots. Default is 2.
        line_color (str, optional): The color of the lines around the filled areas.
                                    Default is None, which means no lines will be drawn.

    Returns:
        bokeh.plotting.figure.Figure: The Bokeh figure containing the density plot.
    """
    positions = np.linspace(-10, 80, 1000)

    # create figure object
    p = figure(
        title=title,  # plot title
        height=300,  # plot height
        width=500,  # plot width
        toolbar_location=None,  # remove toolbars
        x_axis_label="age (years)",
        y_axis_label="scaled density",
    )

    # loop through each data_dict and plot a density plot for each
    for info in data_dict:
        data = info["data"]
        color = info["color"]
        legend_label = info["legend_label"]

        # create kde object and fit object into 'data' parameter
        kde = KernelDensity(kernel=kernel, bandwidth=bandwidth).fit(data[:, np.newaxis])

        # calculate log-density estimation (log_dens) at each position using the 'score_samples' method
        density = np.exp(kde.score_samples(positions[:, np.newaxis]))

        # scale the density estimation to correspond to the number of data values
        scaled_density = density * len(data)

        p.varea(
            x=positions,  # x-axis coordinates
            y1=scaled_density,  # y-axis coordinates of one side of the area
            y2=0,  # y-axis coordinates of the other side of the area
            fill_alpha=0.9,  # transparency of the filled area
            fill_color=color,  # color of the filled area
            legend_label=legend_label,
        )

    # customize the x-axis
    p.x_range.start = 0
    p.xaxis.ticker = [0, 20, 40, 60]
    p.xgrid.grid_line_color = None
    p.xaxis.axis_line_color = None
    p.xaxis.major_tick_line_color = "gray"
    p.xaxis.major_tick_out = 2

    # customize the y-axis
    p.y_range.start = 0
    p.yaxis.minor_tick_out = 0
    p.yaxis.axis_line_color = None
    p.yaxis.major_tick_line_color = "gray"
    p.yaxis.major_tick_out = 0
    p.yaxis.major_tick_in = 0

    p.legend.location = "top_right"

    return p


# generate a single multiple density plot
data_dict = [
    {"data": m_values, "color": "#5BA4DB", "legend_label": "male"},
    {"data": f_values, "color": "#D0771E", "legend_label": "female"},
]

single = plot_kde(data_dict, "Figure 7.8", line_color="black")
show(single)


# generate two multiple density plots and display them in a grid
from bokeh.layouts import gridplot

male_data = [
    {"data": t_values, "color": "#D5D4D3", "legend_label": "all passengers"},
    {"data": m_values, "color": "#055BB2", "legend_label": "male"},
]

female_data = [
    {"data": t_values, "color": "#D5D4D3", "legend_label": "all passengers"},
    {"data": f_values, "color": "#CB6805", "legend_label": "female"},
]
male = plot_kde(male_data, "Figure 7.9 male passengers")
female = plot_kde(female_data, "Figure 7.9 female passengers")

layout = gridplot([male, female], ncols=2)
show(layout)


file = "../data/csv_files/cows.csv"

df = pd.read_csv(file)

# create dataframes for the four different cattle breeds
jersey = df[df["breed"] == "Jersey"]
holstein = df[df["breed"] == "Holstein-Friesian"]
guernsey = df[df["breed"] == "Guernsey"]
ayrshire = df[df["breed"] == "Ayrshire"]

# get butterfat data for the cattle breeds
j_values = jersey.butterfat.values
h_values = holstein.butterfat.values
g_values = guernsey.butterfat.values
a_values = ayrshire.butterfat.values
positions = np.linspace(2, 8, 1000)


from bokeh.models import Label, CustomJSTickFormatter

# arrange plotting data as pandas DataFrame
data_dict = {
    "values": [a_values, g_values, h_values, j_values],
    "bandwidths": [0.125, 0.25, 0.1, 0.3],
    "colors": ["#409DFA", "#AC5703", "#9E5205", "green"],
    "labels": ["Ayrshire", "Guernsey", "Holstein-Friesian", "Jersey"],
}

df = pd.DataFrame(data_dict)

# Create figure object
p = figure(
    title="figure 7.11",  # plot title
    height=300,  # plot height
    width=600,  # plot width
    x_axis_label="butterfat contents",
    y_axis_label="density",
)

# Loop to calculate KDE and plot vareaes
for _, row in df.iterrows():
    data, bandwidth, color, label = (
        row["values"],
        row["bandwidths"],
        row["colors"],
        row["labels"],
    )

    kde = KernelDensity(kernel="gaussian", bandwidth=bandwidth).fit(data[:, np.newaxis])
    log_dens = kde.score_samples(positions[:, np.newaxis])

    p.varea(
        x=positions,
        y1=np.exp(log_dens),
        y2=0,
        fill_alpha=0.3,
        fill_color=color,
    )

    # Find the highest point and annotate with the label
    max_idx = np.argmax(np.exp(log_dens))
    highest_point_label = Label(
        x=positions[max_idx],
        y=np.exp(log_dens[max_idx]),
        text=label,
        text_font_size="10pt",
        x_offset=20,
        y_offset=-5,
        text_color=color,
    )
    p.add_layout(highest_point_label)

# Convert x-axis labels to percentages
x_axis_labels = {3: "3%", 4: "4%", 5: "5%", 6: "6%", 7: "7%"}
p.xaxis.formatter = CustomJSTickFormatter(
    code="""return tick in %s ? %s[tick] : '';""" % (x_axis_labels, x_axis_labels)
)

# customize x-axis
p.x_range.start = 3
p.xgrid.grid_line_color = None
p.xaxis.axis_line_color = None
p.xaxis.major_tick_line_color = "gray"
p.xaxis.major_tick_out = 2
p.xaxis.minor_tick_out = 0

# customize y-axis
p.yaxis.ticker = [0, 0.5, 1, 1.5]
p.y_range.start = 0
p.yaxis.minor_tick_out = 0
p.yaxis.axis_line_color = None
p.yaxis.major_tick_line_color = "gray"
p.yaxis.major_tick_out = 0
p.yaxis.major_tick_in = 0

show(p)


# create dataframe for only the males and females
female = titanic[titanic["sex"] == "female"]
male = titanic[titanic["sex"] == "male"]

# extract age data from the dataframes
m_age = male.age.dropna()
f_age = female.age.dropna()

# compute histograms for both datasets
bins = np.arange(0, 80, 3)
m_hist, edges = np.histogram(m_age, bins=bins)
f_hist, edges = np.histogram(f_age, bins=bins)


from bokeh.models import Label, CustomJSTickFormatter

# create figure object
p = figure(
    title="Figure 7.10",  # plot title
    height=400,  # plot height
    width=600,  # plot width
    x_range=(-60, 40),  # range of x-axis values to display
    toolbar_location=None,  # remove toolbars
    x_axis_label="count",
    y_axis_label="age (years)",
)

# plot male histogram
p.hbar(
    right=m_hist * -1,  # right endpoints of bars
    y=edges[1:],  # y-axis values
    height=2,  # bar height
    color="#055BB2",
)

# plot female histogram
p.hbar(
    right=f_hist,
    y=edges[1:],
    height=2,
    color="#CB6805",
)

# customise x-axis and y-axis
p.xaxis.ticker = [-40, -20, 0, 20, 40]
p.yaxis.ticker = [0, 20, 40, 60]
p.y_range.start = 1.5

# create custom formatter function to make all tick labels positive


def positive_labels():
    return "return Math.abs(tick);"


# apply the custom formatter to the x-axis using CustomJSTickFormatter
p.xaxis.formatter = CustomJSTickFormatter(args=dict(), code=positive_labels())

# add labels
m_label = Label(x=-40, y=70, text="male", text_font_size="15pt", x_offset=5)
f_label = Label(x=20, y=70, text="female", text_font_size="15pt", x_offset=5)

p.add_layout(m_label)
p.add_layout(f_label)

show(p)

Visualizing multiple distributions with histograms and density plots¶

Multiple distribution density plots¶

A.¶

Data preparation¶

Plotting¶

B.¶

Data preparation¶

Plotting¶

Multiple distribution histogram¶

Data preparation¶

Plotting¶