# import the relevant libraries
import pandas as pd
from bokeh.io import output_notebook


output_notebook()  # send Bokeh output to inline notebook cells


# Load the movie table from disk.
file = "../data/csv_files/movies.csv"
df = pd.read_csv(file)

# Shorten each title to the text that precedes its first colon.
df["Title"] = df["Title"].map(lambda title: title.partition(":")[0])

# Parse the digits that follow the "$" sign and rescale to millions of
# dollars so the axis tick labels stay short. The division makes the column
# floating point, not integer.
df["Weekend gross"] = df["Weekend gross"].map(
    lambda amount: int(amount.split("$")[1]) / 1_000_000
)

df


from bokeh.plotting import figure, show


# vertical bar chart: one bar per movie title

# the figure is the canvas; the titles in df define the categorical x-axis
p = figure(
    title="Figure 6.1 movie gross",
    y_axis_label="weekend gross (million USD)",
    x_range=df["Title"],
    width=600,
    height=300,
)

# bar positions and heights are looked up by column name in `source`
p.vbar(
    source=df,
    x="Title",  # categorical position of each bar
    top="Weekend gross",  # bar height
    width=0.7,  # bar width
    color="#66B2FF",  # bar fill color as a hex code
)

p.y_range.start = 0  # anchor the bars on a zero baseline

show(p)  # render the figure


# horizontal bar chart of the same data, ordered by gross

# ascending sort on the gross column; the sorted titles feed the y-axis below
df = df.sort_values(by="Weekend gross", ascending=True)

p = figure(
    title="Figure 6.3 weekend gross",
    x_axis_label="weekend gross (million USD)",
    y_range=df["Title"],  # titles define the categorical y-axis
    sizing_mode="stretch_width",  # width follows the available screen space
    height=300,
)

p.hbar(
    source=df,
    y="Title",  # categorical position of each bar
    right="Weekend gross",  # where each bar ends on the x-axis
    height=0.9,  # bar thickness
    color="#66B2FF",
)

p.x_range.start = 0  # anchor the bars on a zero baseline

show(p)


file = "../data/csv_files/income_by_age.csv"
df = pd.read_csv(file)

# Order the rows by race and then age, and keep positions 7 through 34 of the
# sorted table.
# NOTE(review): presumably the first seven sorted rows and everything from
# position 35 on are categories that should not be plotted - confirm against
# the CSV.
age_group = df.sort_values(["race", "age"]).reset_index(drop=True)
age_group = age_group.iloc[7:35].reset_index(drop=True)

# Pivot to one row per age group and one median_income column per race.
age_group = (
    age_group.groupby(["age", "race"])[["median_income"]]
    .sum()
    .unstack()
    .reset_index()
)

# Flatten the two-level column header; the former index column is left with an
# empty label after the drop, so give it its name back.
age_group.columns = age_group.columns.droplevel(level=0)
age_group = age_group.rename(columns={"": "age"})

age_group


from bokeh.models import NumeralTickFormatter as NTF
from bokeh.palettes import Blues5
from bokeh.transform import dodge


# plot grouped bars: four bars (one per race) side by side for every age group

p = figure(
    title="Figure 6.7 median income by age group",
    height=350,
    sizing_mode="stretch_width",
    x_range=age_group.age,
)

bar_width = 0.2

# (column in age_group, dodge offset on the x-axis, legend label) per race.
# The offsets are one bar width apart and centered on the category position,
# so the four bars sit next to each other without overlapping.
grouped_bars = (
    ("asian", -0.3, "Asian"),
    ("white", -0.1, "White"),
    ("hispanic", 0.1, "Hispanic"),
    ("black", 0.3, "Black"),
)

# zip stops at the shorter input, so only the first four palette colors are used
for color, (column, offset, label) in zip(Blues5, grouped_bars):
    p.vbar(
        x=dodge("age", offset, range=p.x_range),  # shift bars off the category center
        top=column,
        source=age_group,
        width=bar_width,
        color=color,  # bar color taken from the palette
        legend_label=label,  # legend entry for this race
    )

# plot customization

# remove x-axis line, tick marks, and vertical grid lines
p.xaxis.axis_line_color = None
p.xaxis.major_tick_line_color = None
p.xaxis.major_tick_out = 0
p.xgrid.grid_line_color = None

# remove y-axis line and ticks
p.yaxis.minor_tick_out = 0
p.yaxis.axis_line_color = None
p.yaxis.major_tick_out = 0

# start and endpoints of y-axis
# The largest median income in age_group (Asian, 35 to 44: 100,443) is above
# 100,000, so the axis ends at 110,000 to keep that bar from being clipped.
p.y_range.start = 0
p.y_range.end = 110_000

# format y-axis labels with a $ sign and thousands separators
p.yaxis.formatter = NTF(format="$0,0")


show(p)


# Cut one frame per race out of df by row position, drop the columns that are
# not needed downstream, and tag every frame with a display name through a
# plain `name` attribute.
# NOTE(review): the row positions assume a fixed layout of the CSV - confirm
# against the file.
white = df.iloc[7:14].drop(columns=["year", "race"])
white.name = "White"

asian = df.iloc[21:28].drop(columns=["year", "race"])
asian.name = "Asian"

black = df.iloc[28:35].drop(columns=["year", "race"])
black.name = "Black"

hispanic = df.iloc[35:].drop(columns=["year", "race"])
hispanic.name = "Hispanic"

# order in which the frames are collected
races = (asian, white, hispanic, black)


import pandas as pd
from bokeh.plotting import figure
from bokeh.models import FactorRange
from bokeh.models.formatters import NumeralTickFormatter as NTF


# create a function to plot individual bar plots


def plot_bars(df: pd.DataFrame) -> figure:
    """
    Create a Bokeh bar chart of median income by age group.

    The input DataFrame is left unmodified; the chart is drawn from a copy in
    which the age values have been converted to strings.

    Parameters:
        df (pd.DataFrame): The pandas DataFrame containing the data.
            It should have the following columns:
            - age: String or numeric values representing the age groups.
              Values are converted to strings and used as the categories of
              the x-axis, in order of first appearance.
            - median_income: Numeric values representing the median income for each age group.
            If the frame carries a ``name`` attribute (for example
            ``df.name = "Asian"``), it is appended to the plot title;
            otherwise the title is just "Figure 6.9".

    Returns:
        figure: A Bokeh figure object representing the bar chart.

    Raises:
        ValueError: If the required columns (age and median_income) are not present in the DataFrame.
        TypeError: If the median_income column is not numeric.

    Example:
        df = pd.DataFrame({'age': ['18-24', '25-34', '35-44'], 'median_income': [50000, 60000, 70000]})
        plot = plot_bars(df)
        show(plot)
    """
    # Data validation
    missing_columns = {"age", "median_income"}.difference(df.columns)
    if missing_columns:
        raise ValueError("The DataFrame must have 'age' and 'median_income' columns.")

    if not pd.api.types.is_numeric_dtype(df["median_income"]):
        raise TypeError("The 'median_income' column must contain numeric values.")

    # The display name is an optional ad-hoc attribute on the frame. Read it
    # from the original object, because the copy made below does not carry it.
    label = getattr(df, "name", None)
    title = "Figure 6.9" if label is None else f"Figure 6.9: {label}"

    # A categorical range needs string factors, so numeric ages are converted.
    # assign() returns a new frame, which keeps the caller's data untouched.
    data = df.assign(age=df["age"].astype(str))

    # Unique values in order of first appearance define the x-axis categories.
    factors = data["age"].unique().tolist()

    # Function implementation
    p = figure(
        title=title,
        height=300,
        width=400,
        x_range=FactorRange(factors=factors),
        toolbar_location=None,
    )

    p.vbar(x="age", top="median_income", color="#99CCFF", source=data, width=0.9)

    # Axis and grid styling
    p.xgrid.grid_line_color = None
    p.xaxis.major_tick_out = 0
    p.yaxis.formatter = NTF(format="$0,0")  # $ sign with thousands separators
    p.xaxis.axis_label = "age (years)"
    p.yaxis.axis_label = "median income (USD)"
    p.yaxis.minor_tick_out = 0
    p.yaxis.major_tick_out = 0

    # Fixed y-range, identical on every call, so that plots produced for
    # different frames share the same scale.
    p.y_range.start = 0
    p.y_range.end = 110_000

    return p


from bokeh.layouts import gridplot


# build one bar chart per race frame, in the order given by `races`
plots = [plot_bars(race) for race in races]

# Arrange the charts two per row. Passing the flat list with ncols lets
# gridplot do the row splitting for any number of plots; slicing the list by
# hand into plots[:2] and plots[-2:] is only correct for exactly four plots.
layout = gridplot(plots, ncols=2)

show(layout)


file = "../data/csv_files/titanic_all.csv"
df = pd.read_csv(file)

# Count passengers of each sex within each class: one row per class and one
# column per sex.
by_class = df.groupby("class")["sex"].value_counts().unstack()

# Discard the row whose class label is "*" and turn the class index back into
# an ordinary column.
# NOTE(review): "*" presumably marks records without a passenger class -
# confirm against the CSV.
by_class = by_class.drop(index="*").reset_index()

# Overwrite the class values with the labels used in the book; this relies on
# exactly three rows remaining, in this order.
by_class["class"] = ["1st class", "2nd class", "3rd class"]

by_class


# stacked bar chart: male and female passenger counts stacked per class

p = figure(
    title="Figure 6.10 titanic passengers",
    x_range=by_class["class"],  # class labels define the categorical x-axis
    width=500,
    height=300,
    toolbar_location=None,
)

# stack the "male" and "female" columns on top of each other, in that order
p.vbar_stack(
    ["male", "female"],
    x="class",
    source=by_class,
    width=0.9,
    line_color="white",  # white outline visually separates the stacked segments
    line_width=2.5,
    color=["#0000CC", "#CC6600"],  # one color per stacked column
    legend_label=["male passengers", "female passengers"],
)

# Write the count onto each segment. Both labels are anchored at the top of
# the male segment (y="male") and then shifted by a pixel offset, which puts
# one label on either side of the boundary between the two segments.
for column, offset in (("male", 40), ("female", -10)):
    p.text(
        x="class",
        y="male",
        text=column,  # column that supplies the label text
        source=by_class,
        y_offset=offset,
        text_color="#FFFFFF",
        text_align="center",  # center the label horizontally on the bar
    )


# plot customization

# hide the x-axis line and tick marks
p.xaxis.axis_line_color = None
p.xaxis.axis_line_width = 0
p.xaxis.major_tick_out = 0

# start the y-axis at 0 and hide it entirely
p.y_range.start = 0
p.yaxis.visible = False

# hide grid lines and the plot outline
p.grid.grid_line_color = None
p.outline_line_color = None

# legend placement and orientation
p.legend.location = "top_left"
p.legend.orientation = "vertical"

show(p)

	Rank	Title	Weekend gross
0	1	Star Wars	71.565498
1	2	Jumanji	36.169328
2	3	Pitch Perfect 3	19.928525
3	4	The Greatest Showman	8.805843
4	5	Ferdinand	7.316746

race	age	asian	black	hispanic	white
0	15 to 24	45809	30267	45080	44588
1	25 to 34	80098	39176	45876	65389
2	35 to 44	100443	49336	50245	78093
3	45 to 54	98925	50103	58103	82289
4	55 to 64	91193	40363	51996	69387
5	65 to 74	56646	28697	36704	52219
6	> 74	26487	22302	23797	32203

Visualizing amounts with bar plots

Vertical and horizontal bars

Data preparation

Plotting

Grouped and stacked bars

A. Grouped Bars

Data preparation

Plotting

B. Grid of bar plots

Data preparation

Plotting

C. Stacked bars

Data preparation

Plotting

sex	class	female	male
0	1st class	143.0	179.0
1	2nd class	107.0	172.0
2	3rd class	212.0	499.0