Setup - Import Libraries¶

In [1]:
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from sklearn.model_selection import TimeSeriesSplit, train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.preprocessing import StandardScaler

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.io as pio
pio.templates.default = "plotly_dark"

from tqdm.auto import tqdm

Load Data¶

In [2]:
df = pd.read_csv('https://raw.githubusercontent.com/wsovine/data/main/nhl_moneyline_model_logistic_regression_2022_season.csv')
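
A quick peek at the identifying and odds columns (a minimal sketch; only the columns referenced later in the notebook are listed here, the remaining columns are the model's predictors):

In [ ]:
# Peek at the identifying and odds columns; the rest of the columns are model features
df[['season', 'game_date', 'home', 'away', 'home_goals', 'away_goals', 'home_win',
    'home_consensus', 'away_consensus', 'home_implied_prob', 'away_implied_prob']].head()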

Create Time Series Split¶

  1. Sort the dataframe chronologically by game date
  2. Separate the predictors (X) from the dependent variable (y = home_win)
  3. Split into expanding time series folds (~25 weeks in an NHL season; see the fold sketch after the cell below)
In [3]:
df.sort_values('game_date', inplace=True)
df.reset_index(inplace=True, drop=True)

X = df.copy()
X = X.drop(['home', 'away', 'game_date', 'season',
             'home_goals', 'away_goals', 'home_win'], axis=1)

y = df.home_win.values

print(X.shape)

tss = TimeSeriesSplit(n_splits=25)
(1399, 92)
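
Each fold is an expanding window: the training set grows over the season while the test set is the next block of roughly a week of games. A minimal sketch to confirm the fold boundaries (printing just the first and last fold):

In [ ]:
# Print the first and last expanding-window fold to see how the split grows over the season
for fold, (train_idx, test_idx) in enumerate(tss.split(X)):
    if fold in (0, tss.n_splits - 1):
        print(f'Fold {fold}: train rows 0-{train_idx[-1]}, test rows {test_idx[0]}-{test_idx[-1]}')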

Define Helper Functions¶

Functions to scale the predictors and one-hot encode the categorical variables (home and away teams)

In [4]:
def scale(X_train, X_test) -> (pd.DataFrame, pd.DataFrame):
    # Fit the scaler on the training fold only, then apply it to both folds
    scaler = StandardScaler()
    X_train.loc[:,:] = scaler.fit_transform(X_train)
    X_test.loc[:,:] = scaler.transform(X_test)

    return (X_train, X_test)

def dummy(X_train, X_test, original_df) -> (pd.DataFrame, pd.DataFrame):
    # One-hot encode the home and away teams; building the dummies from the full
    # dataframe keeps the train and test columns consistent across folds
    home_dummies = pd.get_dummies(original_df.home, prefix='home')
    X_train = X_train.join(home_dummies, how='inner')
    X_test = X_test.join(home_dummies, how='inner')

    away_dummies = pd.get_dummies(original_df.away, prefix='away')
    X_train = X_train.join(away_dummies, how='inner')
    X_test = X_test.join(away_dummies, how='inner')

    return (X_train, X_test)

def scale_and_dummy(X_train, X_test, original_df) -> (pd.DataFrame, pd.DataFrame):
    # Convenience wrapper: scale the numeric predictors, then append the team dummies
    X_train, X_test = scale(X_train, X_test)
    X_train, X_test = dummy(X_train, X_test, original_df)

    return (X_train, X_test)
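
A quick sanity check of the helpers on an arbitrary slice (the slice boundaries below are only for illustration):

In [ ]:
# The result should have the 92 scaled predictors plus one home_*/away_* dummy column per team
X_tr, X_te = scale_and_dummy(X.iloc[:200].copy(), X.iloc[200:250].copy(), df)
print(X_tr.shape, X_te.shape)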

Functions for calculations that involve the odds and calculating profit

In [5]:
# Convert American odds to the profit on a winning flat bet of u units
def odds_to_profit(odds, u=1):
    if odds < 0:
        profit = -100 / odds
    else:
        profit = odds / 100
    return profit * u

# Convert American odds to the loss on a losing bet sized to win u units
def odds_to_loss(odds, u=1):
    if odds < 0:
        loss = odds / -100
    else:
        loss = 100 / odds
    return -loss * u

# Convert American odds to the implied win probability
def odds_to_prob(odds):
    if odds < 0:
        prob = odds / (odds - 100)
    else:
        prob = 100 / (odds + 100)
    return prob

# Convert American odds to the net fractional odds b (decimal odds minus 1)
def odds_to_dec(odds):
    if odds < 0:
        dec = 100 / -odds
    else:
        dec = odds / 100
    return dec  # add 1 to get true decimal odds

def profitize(df, aggregated=True, kelly_scale=100):
    df['away_win'] = np.where(df.home_win, 0, 1)

    df['home_decimal_odds'] = df.home_consensus.apply(lambda o: odds_to_dec(o))
    df['away_decimal_odds'] = df.away_consensus.apply(lambda o: odds_to_dec(o))

    # Flag a bet on a side whenever the model probability exceeds the implied probability
    df['bet_on_home'] = np.where(
        df.home_prob > df.home_implied_prob, 1, 0
        )
    df['bet_on_away'] = np.where(
        df.away_prob > df.away_implied_prob, 1, 0
        )

    # Flat 1u bet
    df['home_profit_if_win'] = df.home_consensus.apply(lambda o: odds_to_profit(o))
    df['away_profit_if_win'] = df.away_consensus.apply(lambda o: odds_to_profit(o))

    df['home_pnl_if_bet'] = np.where(df.home_win, df.home_profit_if_win, -1)
    df['away_pnl_if_bet'] = np.where(df.away_win, df.away_profit_if_win, -1)

    df['flat_1u_pnl'] = (df.bet_on_home * df.home_pnl_if_bet) + (df.bet_on_away * df.away_pnl_if_bet)

    # Bet to win 1u
    df['home_loss_if_lose'] = df.home_consensus.apply(lambda o: odds_to_loss(o))
    df['away_loss_if_lose'] = df.away_consensus.apply(lambda o: odds_to_loss(o))

    df['home_pnl_if_bet'] = np.where(df.home_win, 1, df.home_loss_if_lose)
    df['away_pnl_if_bet'] = np.where(df.away_win, 1, df.away_loss_if_lose)

    df['win_1u_pnl'] = (df.bet_on_home * df.home_pnl_if_bet) + (df.bet_on_away * df.away_pnl_if_bet)

    # Kelly criterion: stake fraction f* = p - q/b, where b is the net fractional odds,
    # scaled by kelly_scale (the notional bankroll in units)
    df['home_kelly_bet'] = np.where(
        df.home_prob > df.home_implied_prob,
        (df.home_prob - (df.away_prob / df.home_decimal_odds)) * kelly_scale,
        0)
    df['away_kelly_bet'] = np.where(
        df.away_prob > df.away_implied_prob,
        (df.away_prob - (df.home_prob / df.away_decimal_odds)) * kelly_scale,
        0)
    
    df['home_profit_if_win'] = df.apply(lambda r: odds_to_profit(r.home_consensus, r.home_kelly_bet), axis=1)
    df['away_profit_if_win'] = df.apply(lambda r: odds_to_profit(r.away_consensus, r.away_kelly_bet), axis=1)

    df['home_pnl_if_bet'] = np.where(df.home_win, df.home_profit_if_win, df.home_kelly_bet * -1)
    df['away_pnl_if_bet'] = np.where(df.away_win, df.away_profit_if_win, df.away_kelly_bet * -1)

    df['kelly_pnl'] = (df.bet_on_home * df.home_pnl_if_bet) + (df.bet_on_away * df.away_pnl_if_bet)

    if aggregated:
        return df.flat_1u_pnl.sum(), df.win_1u_pnl.sum(), df.kelly_pnl.sum()
    return df.flat_1u_pnl, df.win_1u_pnl, df.kelly_pnl
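
A quick sanity check of the odds conversions (the example odds are arbitrary). Note that odds_to_dec returns the net fractional odds b (decimal odds minus 1), which is exactly the b in the Kelly fraction f* = p - q/b used by profitize:

In [ ]:
# -150 favorite: risk 1.5u to win 1u, implied probability 60%
print(odds_to_prob(-150))    # 0.6
print(odds_to_profit(-150))  # ~0.667 profit per 1u staked
print(odds_to_loss(-150))    # -1.5 when betting to win 1u
print(odds_to_dec(-150))     # ~0.667, the net fractional odds b

# +130 underdog: risk 1u to win 1.3u, implied probability ~43.5%
print(odds_to_prob(130))     # ~0.435
print(odds_to_profit(130))   # 1.3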

Obtain Accuracy Baseline¶

The baseline simply uses the bookmaker's implied home win probability: pick the home team whenever its implied probability is at least 0.5, and use the implied probability itself for ROC AUC.

In [6]:
results = []

for i, (train_index, test_index) in enumerate(tss.split(X)):
    X_test = X.iloc[test_index]
    y_test = y[test_index]

    y_pred = np.where(X_test.home_implied_prob >= .5, 1, 0)
    y_prob = X_test.home_implied_prob

    accuracy = accuracy_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_prob)

    results.append({
        # 'Fold': i,
        'Accuracy': accuracy,
        'ROC AUC': roc_auc
    })

df_results = pd.DataFrame(results)
print(f'Average Accuracy: {df_results.Accuracy.mean()}')
print(f"Average ROC AUC: {df_results['ROC AUC'].mean()}")

display(df_results.style.background_gradient(vmin=0, vmax=1, cmap="viridis"))
Average Accuracy: 0.5894339622641509
Average ROC AUC: 0.6569917363125215
  Accuracy ROC AUC
0 0.566038 0.555714
1 0.528302 0.697143
2 0.509434 0.505000
3 0.622642 0.677857
4 0.528302 0.617302
5 0.603774 0.746334
6 0.584906 0.671652
7 0.603774 0.688571
8 0.622642 0.725714
9 0.509434 0.695714
10 0.566038 0.644286
11 0.603774 0.656429
12 0.679245 0.678986
13 0.603774 0.514493
14 0.566038 0.631696
15 0.603774 0.609687
16 0.547170 0.625000
17 0.603774 0.718661
18 0.566038 0.618116
19 0.622642 0.716236
20 0.603774 0.669516
21 0.735849 0.816667
22 0.698113 0.784058
23 0.566038 0.764663
24 0.490566 0.395299

Logistic Regression¶

Model Using Time Series Split¶

For each fold, LogisticRegressionCV selects the regularization strength C via internal cross-validation on the training data, and the fitted model is evaluated on the held-out fold for accuracy, ROC AUC, and betting PnL.

In [7]:
results = []
coefs = []

for train_index, test_index in tqdm(tss.split(X), total=tss.n_splits):
    X_train = X.iloc[train_index].copy()
    X_test = X.iloc[test_index].copy()
    X_train, X_test = scale_and_dummy(X_train, X_test, df)

    y_train = y[train_index]
    y_test = y[test_index]

    logr = LogisticRegressionCV(solver='liblinear')
    logr.fit(X_train, y_train)

    y_pred = logr.predict(X_test)
    y_prob = logr.predict_proba(X_test)[:,1]

    df_test = df.iloc[test_index].copy()
    df_test['home_prob'] = y_prob
    df_test['away_prob'] = 1 - df_test.home_prob

    accuracy = accuracy_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_prob)
    flat_1u_pnl, win_1u_pnl, kelly_pnl = profitize(df_test, kelly_scale=100)

    results.append({
        'C': logr.C_,
        'Accuracy': accuracy,
        'ROC AUC': roc_auc,
        'Flat 1u PNL': flat_1u_pnl,
        'Win 1u PNL': win_1u_pnl,
        'Kelly PNL': kelly_pnl
    })

    coefs.append(
        pd.DataFrame(
            zip(logr.feature_names_in_, logr.coef_[0]), 
            columns=['feature', 'coef']
            )
        )

df_results = pd.DataFrame(results)
print(f'Average Accuracy: {df_results.Accuracy.mean()}')
print(f"Average ROC AUC: {df_results['ROC AUC'].mean()}")
print(f"Flat 1u PNL: {df_results['Flat 1u PNL'].sum()}")
print(f"Win 1u PNL: {df_results['Win 1u PNL'].sum()}")
print(f"Kelly PNL: {df_results['Kelly PNL'].sum()}")

display(df_results.style.background_gradient(vmin=0, vmax=1, cmap="viridis"))
Average Accuracy: 0.6022641509433962
Average ROC AUC: 0.651997574729799
Flat 1u PNL: 27.120780967526894
Win 1u PNL: 33.702595095243325
Kelly PNL: 3.021424688039474
  C Accuracy ROC AUC Flat 1u PNL Win 1u PNL Kelly PNL
0 [0.35938137] 0.471698 0.438571 -5.776715 -9.966163 -349.027682
1 [0.00077426] 0.603774 0.672857 2.298095 4.492207 -72.754322
2 [0.00077426] 0.433962 0.480000 2.649231 -2.562134 194.903584
3 [0.00077426] 0.584906 0.668571 -6.033787 -2.979082 -109.172533
4 [0.0001] 0.584906 0.609971 2.990000 1.096302 102.046840
5 [0.00077426] 0.660377 0.730205 3.690484 6.535655 36.681473
6 [0.0001] 0.660377 0.673789 0.102381 -0.693778 25.834222
7 [0.0001] 0.622642 0.687143 -1.137619 0.363564 -14.487678
8 [0.0001] 0.660377 0.740000 -8.330952 -2.445655 -185.463058
9 [0.0001] 0.603774 0.702857 4.316075 5.316572 -9.910492
10 [0.0001] 0.566038 0.638571 -3.478528 -1.514334 -42.217336
11 [0.00077426] 0.641509 0.660000 5.252660 4.839349 26.842245
12 [0.00599484] 0.566038 0.682609 -6.012071 -2.417448 -51.766045
13 [0.00599484] 0.566038 0.528986 10.934954 4.916761 57.730433
14 [0.00077426] 0.584906 0.623512 6.427688 5.441498 68.806396
15 [0.00077426] 0.584906 0.608262 5.525149 1.073741 71.747317
16 [0.00077426] 0.566038 0.626488 15.041337 11.492202 115.592080
17 [0.00077426] 0.641509 0.739316 -5.543025 -0.760136 -39.132553
18 [0.00599484] 0.603774 0.659420 13.121338 8.414751 219.898644
19 [0.00077426] 0.603774 0.731322 5.481316 9.831838 29.108564
20 [0.00077426] 0.622642 0.676638 -7.621014 -8.353649 -27.779952
21 [0.00077426] 0.792453 0.811594 -10.724180 -4.130716 -53.483497
22 [0.00077426] 0.735849 0.802899 -9.417097 -4.942845 -70.578776
23 [0.00599484] 0.641509 0.733138 2.964514 6.827927 60.380665
24 [0.00077426] 0.452830 0.373219 10.400548 3.826171 19.222883

Coefficients¶

In [8]:
pd.concat(coefs, axis=0).groupby('feature').agg(
    {'coef': ['mean', 'median', 'std']}
    ).sort_values(('coef', 'median'), key=abs, ascending=False).head()
Out[8]:
coef
mean median std
feature
home_goal_diff_offense_str -0.040995 -0.028893 0.055133
away_goal_diff_defense_str -0.017175 -0.020192 0.048657
home_implied_prob 0.018102 0.013422 0.019904
away_implied_prob -0.016325 -0.013054 0.015817
blocks_hia -0.017953 -0.012090 0.028508
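
To eyeball which features the regularized models lean on, the median coefficients across folds can be plotted with the same plotly setup (a sketch; df_coef_med and top are names introduced here):

In [ ]:
# Median coefficient per feature across the 25 folds, top 15 by magnitude
df_coef_med = pd.concat(coefs).groupby('feature').coef.median()
top = df_coef_med.reindex(df_coef_med.abs().sort_values(ascending=False).head(15).index)

fig = px.bar(top.reset_index(), x='coef', y='feature', orientation='h')
fig.update_xaxes(title='Median coefficient across folds')
fig.update_yaxes(title='Feature')
fig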

Model Using Each Day¶

Retrain the model on every game day using all prior games from the same season (a walk-forward backtest), then generate probabilities and bets for that day's slate.

In [9]:
dfs = []
game_dates = df.groupby(['season', 'game_date']).count().reset_index()[['season', 'game_date']]
kelly_bankroll = 0

all_game_dates = tqdm(game_dates.itertuples(), total=game_dates.shape[0])

for scope in all_game_dates:
    y_train = y[(df.game_date < scope.game_date) & (df.season == scope.season)]

    if (len(y_train) >= 16 
        and y_train[y_train == 0].shape[0] > 5 
        and y_train[y_train == 1].shape[0] > 5):

        X_train = X[(df.game_date < scope.game_date) & (df.season == scope.season)].copy()
        df_test = df[df.game_date == scope.game_date]
        X_test = X.iloc[df_test.index].copy()

        df_test = df_test[['season', 'game_date', 'home', 'away', 'home_goals', 'away_goals',
                           'home_win', 'home_consensus', 'away_consensus',
                           'home_implied_prob', 'away_implied_prob']]
        df_test['away_win'] = np.where(df_test.home_win, 0, 1)

        X_train, X_test = scale_and_dummy(X_train, X_test, df)

        logr = LogisticRegression(C=.00077426, solver='liblinear')
        logr.fit(X_train, y_train)

        y_pred = logr.predict(X_test)
        y_prob = logr.predict_proba(X_test)[:,1]

        df_test['home_prob'] = y_prob
        df_test['away_prob'] = 1 - df_test.home_prob

        flat_1u_pnl, win_1u_pnl, kelly_pnl = profitize(df_test, aggregated=False, kelly_scale=100)
        df_test['flat_1u_pnl'] = flat_1u_pnl
        df_test['win_1u_pnl'] = win_1u_pnl
        df_test['kelly_pnl'] = kelly_pnl
        kelly_bankroll += kelly_pnl.sum()

        all_game_dates.set_description(f'{scope.game_date} | ${kelly_bankroll:.0f}')
                
        dfs.append(df_test)

df_test = pd.concat(dfs)
# display(df_test)
In [10]:
df_pnl = df_test.copy()

df_pnl['game_date'] = pd.to_datetime(df_pnl.game_date, format='%Y%m%d')

df_pnl['cum_flat_1u_pnl'] = df_pnl.groupby('season').flat_1u_pnl.transform('cumsum')
df_pnl['cum_win_1u_pnl'] = df_pnl.groupby('season').win_1u_pnl.transform('cumsum')
df_pnl['cum_kelly_pnl'] = df_pnl.groupby('season').kelly_pnl.transform('cumsum')
In [11]:
fig = px.area(df_pnl, x='game_date', y='cum_kelly_pnl', color='season')
fig.update_yaxes(title='Profit / Loss')
fig.update_xaxes(title='Date')
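
For comparison, the same cumulative view for all three staking strategies (a sketch; note the Kelly PnL is in bankroll units because kelly_scale=100, so it dwarfs the 1u strategies on a shared axis):

In [ ]:
# Reshape to long form so each staking strategy gets its own line
df_long = df_pnl.melt(
    id_vars=['game_date', 'season'],
    value_vars=['cum_flat_1u_pnl', 'cum_win_1u_pnl', 'cum_kelly_pnl'],
    var_name='strategy', value_name='cum_pnl'
)
fig = px.line(df_long, x='game_date', y='cum_pnl', color='strategy')
fig.update_yaxes(title='Profit / Loss')
fig.update_xaxes(title='Date')
fig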
In [ ]: