from bokeh.io import output_notebook
import pandas as pd

# Configure Bokeh's output for notebook use so the show() calls further down
# display their figures inline instead of writing/opening a separate file.
output_notebook()  # render plots inline in notebook


# Load the Lincoln weather CSV and derive the columns the plots need:
# a parsed DATE, the mean of TMAX/TMIN as TAVG, and an abbreviated month name.
file = "../data/csv_files/lincoln.csv"
df = pd.read_csv(file).assign(
    DATE=lambda frame: pd.to_datetime(frame["DATE"]),
    TAVG=lambda frame: (frame["TMAX"] + frame["TMIN"]) / 2,
    MONTH=lambda frame: frame["DATE"].dt.strftime("%b"),
)

# Keep only the month label and the three temperature columns.
df = df[["MONTH", "TMIN", "TMAX", "TAVG"]]

# Per-month quartiles of TAVG, reshaped to one row per month.
tavg_by_month = df.groupby("MONTH")["TAVG"]
qs = tavg_by_month.quantile([0.25, 0.5, 0.75]).unstack().reset_index()
qs.columns = ["MONTH", "Q1", "Q2", "Q3"]

# Whisker limits: 1.5 interquartile ranges beyond the outer quartiles.
iqr = qs["Q3"] - qs["Q1"]
qs["upper"] = qs["Q3"] + 1.5 * iqr
qs["lower"] = qs["Q1"] - 1.5 * iqr

# Attach the monthly summary to every observation so that each row can later
# be compared against its own month's whisker limits.
df = df.merge(qs, on="MONTH", how="left")

df.head()


from bokeh.models import ColumnDataSource, Whisker
from bokeh.plotting import figure, show


# create figure object; passing the month labels as a plain list makes the
# x-axis categorical, ordered by first appearance of each month in df
p = figure(
    title="Figure 9.3",
    x_range=list(df.MONTH.unique()),
    toolbar_location=None,
    height=400,
    width=600,
    x_axis_label="month",
    y_axis_label="mean temperature (F)",
)

# The boxes and whiskers are driven by the per-month summary table `qs`
# (one row per month) rather than by the merged frame `df`. Using `df` here
# would draw one identical, overlapping box and whisker for every row of the
# data instead of a single glyph per month.
source = ColumnDataSource(qs)

# create whisker object and add it to figure
whisker = Whisker(base="MONTH", upper="upper", lower="lower", source=source)
whisker.upper_head.size = whisker.lower_head.size = 20
p.add_layout(whisker)

# create boxplot using two vbar() glyphs: the lower one spans Q1..Q2 and the
# upper one Q2..Q3, so the line where they meet marks the median
p.vbar(
    x="MONTH",
    top="Q2",
    bottom="Q1",
    width=0.8,
    source=source,
    color="#E0E0E0",
    line_color="black",
)

p.vbar(
    x="MONTH",
    top="Q3",
    bottom="Q2",
    width=0.8,
    source=source,
    color="#E0E0E0",
    line_color="black",
)

# plot outliers using scatter() glyph; an outlier is any row of df whose TAVG
# falls outside its own month's [lower, upper] whisker limits
outliers = df[~df.TAVG.between(df.lower, df.upper)]
p.scatter("MONTH", "TAVG", source=outliers, size=5, color="black")

# customize plot
p.y_range.start = -10
p.yaxis.ticker = [0, 25, 50, 75]
p.grid.grid_line_color = None

show(p)


import numpy as np

# Month labels in order of first appearance in df; reused as the categorical
# x-axis and as the loop order for the per-month plots below.
months = [*df["MONTH"].unique()]


from sklearn.neighbors import KernelDensity


# Empty canvas for Figure 9.8: one categorical x position per month label,
# temperature on the y-axis, toolbar hidden.
p = figure(
    x_range=months,
    x_axis_label="month",
    y_axis_label="mean temperature (F)",
    title="Figure 9.8",
    width=500,
    height=400,
    toolbar_location=None,
)

# create an offset for each category in the data


def offset(category, data, scale=7):
    """Pair each scaled value in ``data`` with ``category``.

    The result is a list of ``(category, value * scale)`` tuples, one per
    element of ``data``, which the plotting code below passes as coordinates
    on the categorical month axis.

    Parameters:
        category: Label to repeat in the first slot of every tuple.
        data: One-dimensional sequence or array of numbers.
        scale: Multiplier applied to every value (default 7).

    Returns:
        list: ``(category, scaled_value)`` tuples in the order of ``data``.
    """
    # Convert first so that ``scale`` multiplies element-wise. With a plain
    # list or tuple, ``scale * data`` would repeat the sequence instead, and
    # zip() would then silently hand back the unscaled values.
    scaled = np.asarray(data) * scale
    return list(zip([category] * len(scaled), scaled))


for month in months:
    # TAVG values for this month, with incomplete rows removed
    observed = df.loc[df["MONTH"] == month].dropna()["TAVG"].to_numpy()
    # 50 evenly spaced temperatures covering this month's observed range
    grid = np.linspace(observed.min(), observed.max(), 50)

    # fit a Gaussian KDE to the month's values and evaluate it on the grid;
    # score_samples() returns log-densities, hence the exp()
    kde = KernelDensity(kernel="gaussian", bandwidth=3)
    kde.fit(observed.reshape(-1, 1))
    grid_density = np.exp(kde.score_samples(grid.reshape(-1, 1)))

    # mirror the density on both sides of the month's position
    p.harea(
        x1=offset(month, grid_density),
        x2=offset(month, -grid_density),
        y=grid,
        alpha=0.8,
        color="#E0E0E0",
    )

    # horizontal jitter: uniform in [-1, 1), scaled by the KDE density at each
    # observed value, so the points stay inside the mirrored density shape
    point_density = np.exp(kde.score_samples(observed.reshape(-1, 1)))
    spread = np.random.random(len(observed)) * 2 - 1

    # one black marker per observed value
    p.scatter(
        x=offset(month, spread * point_density),
        y=observed,
        color="black",
    )

# axis and grid styling
p.y_range.start = -10
p.yaxis.ticker = [0, 25, 50, 75]
p.grid.grid_line_color = None

show(p)


file = "../data/csv_files/dw_nominate_house.csv"

df = pd.read_csv(file)

# derive a year for every row from its congress number: 1787 + 2 * congress
df["year"] = 1787 + 2 * df["congress"]

# row filters: year 1962 or later, party_code 100 or 200 only, and cd not
# equal to 0, 98 or 99
recent = df["year"] >= 1962
two_parties = df["party_code"].isin([100, 200])
kept_cd = ~df["cd"].isin([0, 98, 99])

df = df.loc[recent & two_parties & kept_cd].reset_index(drop=True)

# split by party_code: 100 goes to `dems`, 200 to `repubs`
dems = df.loc[df["party_code"].eq(100)]
repubs = df.loc[df["party_code"].eq(200)]


from bokeh.models import ColumnDataSource
import numpy as np
from scipy.stats import gaussian_kde
from sklearn.neighbors import KernelDensity

# create a column data source containing the KDE data for each


def tweak_df(df):
    """
    Build a ColumnDataSource of per-year Kernel Density Estimates (KDE).

    For every distinct value of "year", a Gaussian KDE of that year's "dim_1"
    values is evaluated on a fixed grid of 500 scores between -1.5 and 1.5.
    Each year becomes one column, named ``str(year)``, holding
    ``(year_label, density)`` tuples. The grid itself is stored in the
    "scores" column, which is always the first column of the source.

    Three filler columns ("1957", "1959", "1961") are added before the real
    years to create blank space in the KDE plot.

    Parameters:
        df (pd.DataFrame): Input DataFrame with columns "year" and "dim_1".

    Returns:
        bokeh.models.ColumnDataSource: A ColumnDataSource containing KDE data for each year.

    Raises:
        ValueError: If the input is not a DataFrame, lacks the required
            columns, or the KDE cannot be computed for some year. In the last
            case the underlying error is attached as ``__cause__``.
    """
    # Validate input DataFrame
    if not isinstance(df, pd.DataFrame):
        raise ValueError("Input must be a pandas DataFrame.")
    if "year" not in df.columns or "dim_1" not in df.columns:
        raise ValueError("Input DataFrame must have columns 'year' and 'dim_1'.")

    # Column Data Source holding the shared score grid
    scores = np.linspace(-1.5, 1.5, 500)
    source = ColumnDataSource(data=dict(scores=scores))

    # extra columns that create blank space in the KDE plot
    blank = np.linspace(100, 200, 500)
    for filler_year in ("1957", "1959", "1961"):
        source.add(blank, filler_year)

    for year, vote in df.groupby("year")["dim_1"]:
        label = str(year)
        # Only the KDE itself is guarded, so that unrelated failures are not
        # mislabelled as KDE errors; the original exception is chained.
        try:
            density = gaussian_kde(vote)(scores)
        except Exception as e:
            raise ValueError(f"Error occurred during KDE calculation: {e}") from e
        # pair every density value with its year label to form the ridge
        source.add(list(zip([label] * len(density), density)), label)

    return source


from bokeh.models import ColumnDataSource
from bokeh.plotting import figure


def plot_ridges(df1, df2=None):
    """
    Draw a ridgeline plot from one or two KDE column data sources.

    Every column of ``df1`` other than "scores" is treated as one category on
    the y-axis and drawn as a patch over the shared "scores" column. The
    categories are placed in reverse column order.

    Parameters:
        df1: ColumnDataSource for the first category; drawn in blue and
            labelled "Democrats" in the legend.
        df2 (optional): ColumnDataSource for the second category; drawn in
            red and labelled "Republicans". Must have the same column names
            as ``df1``.

    Returns:
        bokeh.plotting.figure: A Bokeh figure showing the ridges for both categories.

    Raises:
        ValueError: If an argument is not a ColumnDataSource, if ``df1`` has
            no "scores" column, or if ``df2`` does not have the same column
            names as ``df1``.
    """
    # Input validation, done once up front before any figure is built
    if not isinstance(df1, ColumnDataSource):
        raise ValueError(f"{df1} must be a valid ColumnDataSource")
    if "scores" not in df1.data:
        raise ValueError(f"{df1} must have a 'scores' column")

    if df2 is not None:
        if not isinstance(df2, ColumnDataSource):
            raise ValueError(f"{df2} must be a valid ColumnDataSource")
        if set(df2.data.keys()) != set(df1.data.keys()):
            raise ValueError(f"{df2} must have the same keys as {df1}")

    # Categories are all columns except the shared x grid, in reverse order.
    # Filtering by name (rather than dropping whichever key came first) keeps
    # this correct even if "scores" is not the first column.
    years = [key for key in reversed(list(df1.data)) if key != "scores"]

    # Create a figure object
    p = figure(
        title="Figure 9.12",
        x_axis_label="DW-NOMINATE score",
        y_axis_label="year",
        toolbar_location=None,
        x_range=(-0.75, 1.5),
        y_range=years,
        height=400,
        width=600,
    )

    # Plot ridges for each category
    for year in years:
        # Plot ridges for the first category
        p.patch(
            x="scores",
            y=year,
            source=df1,
            fill_color="blue",
            line_color="white",
            legend_label="Democrats",
            fill_alpha=0.5,
        )

        # Plot ridges for the second category if df2 is provided
        if df2 is not None:
            p.patch(
                x="scores",
                y=year,
                source=df2,
                fill_color="red",
                line_color="white",
                legend_label="Republicans",
                alpha=0.4,
            )

    # Customize plot
    p.xaxis.ticker = [-0.75, -0.5, -0.25, 0.00, 0.25, 0.5, 0.75, 1.00]
    p.ygrid.grid_line_color = None
    p.legend.orientation = "horizontal"
    p.legend.border_line_color = None

    return p


# build one KDE source per party (Republicans first, then Democrats) and
# overlay both in a single ridgeline figure
rep_cds, dem_cds = (tweak_df(party_df) for party_df in (repubs, dems))

ridgeplot = plot_ridges(df1=dem_cds, df2=rep_cds)

show(ridgeplot)

	MONTH	TMIN	TMAX	TAVG	Q1	Q2	Q3	upper	lower
0	Jan	15.0	36.0	25.5	23.0	27.5	31.5	44.25	10.25
1	Jan	18.0	39.0	28.5	23.0	27.5	31.5	44.25	10.25
2	Jan	15.0	32.0	23.5	23.0	27.5	31.5	44.25	10.25
3	Jan	15.0	27.0	21.0	23.0	27.5	31.5	44.25	10.25
4	Jan	21.0	40.0	30.5	23.0	27.5	31.5	44.25	10.25

Visualizing many distributions at once using boxplots and ridgeline plots

Boxplot

Data preparation

Plotting

Sina plot

Data preparation

Plotting

Ridgeline plot

Data preparation

Plotting