from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from sklearn.model_selection import TimeSeriesSplit, train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.io as pio
pio.templates.default = "plotly_dark"
from tqdm.auto import tqdm
# Load the season data and put the games in chronological order so that the
# time-series splits further down never train on games played after the ones
# they are evaluated on.
df = (
    pd.read_csv('https://raw.githubusercontent.com/wsovine/data/main/nhl_moneyline_model_logistic_regression_2022_season.csv')
    .sort_values('game_date')
    .reset_index(drop=True)
)

# Everything except identifiers, the date/season keys and the game outcome
# columns is used as a predictor.
non_feature_columns = [
    'home', 'away', 'game_date', 'season',
    'home_goals', 'away_goals', 'home_win',
]
X = df.drop(columns=non_feature_columns)

# Target: the home_win column of each game.
y = df['home_win'].values
print(X.shape)

# Expanding-window splitter shared by the evaluation loops below.
tss = TimeSeriesSplit(n_splits=25)
(1399, 92)
Functions to scale the predictors and one-hot encode the categorical variables
def scale(X_train, X_test) -> (pd.DataFrame, pd.DataFrame):
    """Standardise both frames using statistics fitted on ``X_train`` only.

    The scaler is fitted on the training frame and then applied to the test
    frame, so no information from the test rows leaks into the scaling.

    Unlike an in-place ``.loc[:, :] = ...`` assignment, this builds new
    float frames: the caller's inputs are left untouched and integer columns
    are not silently upcast (which recent pandas versions warn about).

    Args:
        X_train: Numeric predictor frame used to fit the scaler.
        X_test: Numeric predictor frame with the same columns as ``X_train``.

    Returns:
        A ``(X_train_scaled, X_test_scaled)`` tuple of new DataFrames that
        keep the index and column labels of the corresponding inputs.
    """
    scaler = StandardScaler()
    # Passing the DataFrames (not bare arrays) lets the scaler see the same
    # column names at fit and transform time.
    scaled_train = pd.DataFrame(
        scaler.fit_transform(X_train),
        index=X_train.index,
        columns=X_train.columns,
    )
    scaled_test = pd.DataFrame(
        scaler.transform(X_test),
        index=X_test.index,
        columns=X_test.columns,
    )
    return (scaled_train, scaled_test)
def dummy(X_train, X_test, original_df) -> (pd.DataFrame, pd.DataFrame):
    """Append one-hot team indicators to the train and test frames.

    The indicator columns are built from the ``home`` and ``away`` columns of
    ``original_df`` (so both frames always receive the same set of columns)
    and are attached by an inner join on the index.

    Args:
        X_train: Training predictors, indexed like ``original_df``.
        X_test: Test predictors, indexed like ``original_df``.
        original_df: Frame holding the ``home`` and ``away`` columns.

    Returns:
        ``(X_train, X_test)`` with ``home_*`` columns followed by ``away_*``
        columns appended.
    """
    # Home indicators are joined first, then away, which fixes column order.
    for side in ('home', 'away'):
        indicators = pd.get_dummies(original_df[side], prefix=side)
        X_train = X_train.join(indicators, how='inner')
        X_test = X_test.join(indicators, how='inner')
    return (X_train, X_test)
def scale_and_dummy(X_train, X_test, original_df) -> (pd.DataFrame, pd.DataFrame):
    """Scale the predictors, then append the one-hot team indicators.

    Scaling runs first so that the indicator columns added by ``dummy`` are
    not standardised.

    Args:
        X_train: Training predictors.
        X_test: Test predictors.
        original_df: Frame holding the ``home`` and ``away`` columns.

    Returns:
        The ``(X_train, X_test)`` tuple produced by ``dummy``.
    """
    scaled_train, scaled_test = scale(X_train, X_test)
    return dummy(scaled_train, scaled_test, original_df)
Functions for converting betting odds and calculating profit
def odds_to_profit(odds, u = 1):
    """Return the profit of a winning bet of ``u`` units at American odds.

    Negative odds pay ``100 / |odds|`` per unit staked; non-negative odds pay
    ``odds / 100`` per unit staked.

    Args:
        odds: American moneyline odds (e.g. ``-150`` or ``+130``).
        u: Stake in units.

    Returns:
        Profit, in units, if the bet wins.
    """
    per_unit = -100 / odds if odds < 0 else odds / 100
    return per_unit * u
def odds_to_loss(odds, u = 1):
    """Return the (negative) loss of a losing bet sized to win ``u`` units.

    For negative odds the amount risked to win one unit is ``|odds| / 100``;
    for positive odds it is ``100 / odds``.

    Args:
        odds: American moneyline odds; must be non-zero.
        u: Number of units the bet was sized to win.

    Returns:
        The loss as a negative number of units.
    """
    if odds < 0:
        return -(odds / -100) * u
    return -(100 / odds) * u
def odds_to_prob(odds):
    """Convert American odds to the win probability they imply.

    Args:
        odds: American moneyline odds (e.g. ``-150`` or ``+130``).

    Returns:
        ``odds / (odds - 100)`` for negative odds, otherwise
        ``100 / (odds + 100)``.
    """
    is_negative = odds < 0
    return odds / (odds - 100) if is_negative else 100 / (odds + 100)
def odds_to_dec(odds):
    """Convert American odds to net payout per unit staked.

    Note that the returned stake is NOT included: the value is the decimal
    odds minus one (the ``+ 1`` was deliberately left out), i.e. the ``b``
    term used by the Kelly formula in ``profitize``.

    Args:
        odds: American moneyline odds (e.g. ``-150`` or ``+130``).

    Returns:
        ``100 / -odds`` for negative odds, otherwise ``odds / 100``.
    """
    if odds < 0:
        return 100 / -odds
    return odds / 100
def profitize(df, aggregated=True, kelly_scale=100):
    """Compute profit and loss of three staking strategies for each game.

    A bet is placed on a side whenever the model probability for that side
    exceeds the implied probability. Three stake sizes are evaluated:

    * flat 1u   - risk one unit per bet;
    * win 1u    - size the bet so that a win pays one unit;
    * Kelly     - stake ``(p - q / b) * kelly_scale`` where ``p`` is the model
      probability for the side, ``q`` the model probability for the opponent
      and ``b`` the net odds from ``odds_to_dec``.

    ``df`` is modified in place. It must already contain ``home_win``,
    ``home_consensus``, ``away_consensus``, ``home_prob``, ``away_prob``,
    ``home_implied_prob`` and ``away_implied_prob``. The helper columns
    ``*_profit_if_win`` and ``*_pnl_if_bet`` are reused by each strategy, so
    after the call they hold the Kelly values.

    Args:
        df: One row per game (see required columns above).
        aggregated: When true return the three totals, otherwise the three
            per-game Series.
        kelly_scale: Multiplier applied to the Kelly fraction to get a stake.

    Returns:
        ``(flat_1u, win_1u, kelly)`` as sums or as per-game Series.
    """
    sides = ('home', 'away')

    def realised_pnl():
        # Combine the per-side outcomes, counting only the side actually bet.
        return (df['bet_on_home'] * df['home_pnl_if_bet']) + (df['bet_on_away'] * df['away_pnl_if_bet'])

    df['away_win'] = np.where(df['home_win'], 0, 1)
    for side in sides:
        df[f'{side}_decimal_odds'] = df[f'{side}_consensus'].apply(odds_to_dec)

    # Bet on a side only when the model rates it above the implied probability.
    for side in sides:
        has_edge = df[f'{side}_prob'] > df[f'{side}_implied_prob']
        df[f'bet_on_{side}'] = np.where(has_edge, 1, 0)

    # Strategy 1: flat one-unit stake (lose exactly 1 on a losing bet).
    for side in sides:
        df[f'{side}_profit_if_win'] = df[f'{side}_consensus'].apply(odds_to_profit)
    for side in sides:
        df[f'{side}_pnl_if_bet'] = np.where(
            df[f'{side}_win'], df[f'{side}_profit_if_win'], -1
        )
    df['flat_1u_pnl'] = realised_pnl()

    # Strategy 2: stake whatever is needed to win one unit.
    for side in sides:
        df[f'{side}_loss_if_lose'] = df[f'{side}_consensus'].apply(odds_to_loss)
    for side in sides:
        df[f'{side}_pnl_if_bet'] = np.where(
            df[f'{side}_win'], 1, df[f'{side}_loss_if_lose']
        )
    df['win_1u_pnl'] = realised_pnl()

    # Strategy 3: Kelly criterion stake, zero when there is no edge.
    for side, opponent in (('home', 'away'), ('away', 'home')):
        kelly_fraction = df[f'{side}_prob'] - df[f'{opponent}_prob'] / df[f'{side}_decimal_odds']
        df[f'{side}_kelly_bet'] = np.where(
            df[f'{side}_prob'] > df[f'{side}_implied_prob'],
            kelly_fraction * kelly_scale,
            0)
    for side in sides:
        odds_col, stake_col = f'{side}_consensus', f'{side}_kelly_bet'
        df[f'{side}_profit_if_win'] = df.apply(
            lambda row: odds_to_profit(row[odds_col], row[stake_col]), axis=1
        )
    for side in sides:
        df[f'{side}_pnl_if_bet'] = np.where(
            df[f'{side}_win'], df[f'{side}_profit_if_win'], df[f'{side}_kelly_bet'] * -1
        )
    df['kelly_pnl'] = realised_pnl()

    if not aggregated:
        return df['flat_1u_pnl'], df['win_1u_pnl'], df['kelly_pnl']
    return df['flat_1u_pnl'].sum(), df['win_1u_pnl'].sum(), df['kelly_pnl'].sum()
# Baseline: score the implied probabilities on their own, fold by fold, so the
# model below has a reference to compare against. No model is fitted here, so
# only the test side of each split is used.
results = []
for train_index, test_index in tss.split(X):
    X_test, y_test = X.iloc[test_index], y[test_index]
    y_prob = X_test.home_implied_prob
    # Predict a home win whenever the implied probability is at least 50%.
    y_pred = np.where(y_prob >= .5, 1, 0)
    results.append({
        'Accuracy': accuracy_score(y_test, y_pred),
        'ROC AUC': roc_auc_score(y_test, y_prob),
    })

df_results = pd.DataFrame(results)
print(f'Average Accuracy: {df_results.Accuracy.mean()}')
print(f"Average ROC AUC: {df_results['ROC AUC'].mean()}")
display(df_results.style.background_gradient(vmin=0, vmax=1, cmap="viridis"))
Average Accuracy: 0.5894339622641509 Average ROC AUC: 0.6569917363125215
| Accuracy | ROC AUC | |
|---|---|---|
| 0 | 0.566038 | 0.555714 |
| 1 | 0.528302 | 0.697143 |
| 2 | 0.509434 | 0.505000 |
| 3 | 0.622642 | 0.677857 |
| 4 | 0.528302 | 0.617302 |
| 5 | 0.603774 | 0.746334 |
| 6 | 0.584906 | 0.671652 |
| 7 | 0.603774 | 0.688571 |
| 8 | 0.622642 | 0.725714 |
| 9 | 0.509434 | 0.695714 |
| 10 | 0.566038 | 0.644286 |
| 11 | 0.603774 | 0.656429 |
| 12 | 0.679245 | 0.678986 |
| 13 | 0.603774 | 0.514493 |
| 14 | 0.566038 | 0.631696 |
| 15 | 0.603774 | 0.609687 |
| 16 | 0.547170 | 0.625000 |
| 17 | 0.603774 | 0.718661 |
| 18 | 0.566038 | 0.618116 |
| 19 | 0.622642 | 0.716236 |
| 20 | 0.603774 | 0.669516 |
| 21 | 0.735849 | 0.816667 |
| 22 | 0.698113 | 0.784058 |
| 23 | 0.566038 | 0.764663 |
| 24 | 0.490566 | 0.395299 |
# Walk-forward evaluation of a regularised logistic regression: for every
# time-series split, fit on the earlier games, predict the later ones, and
# record classification metrics plus the P&L of each staking strategy.
results = []
coefs = []
for train_index, test_index in tqdm(tss.split(X), total=tss.n_splits):
    y_train, y_test = y[train_index], y[test_index]
    # Copies are passed so preprocessing can never touch the shared X frame.
    X_train, X_test = scale_and_dummy(
        X.iloc[train_index].copy(), X.iloc[test_index].copy(), df
    )

    # LogisticRegressionCV picks the regularisation strength C per fold.
    logr = LogisticRegressionCV(solver='liblinear')
    logr.fit(X_train, y_train)
    y_pred = logr.predict(X_test)
    y_prob = logr.predict_proba(X_test)[:, 1]

    # profitize needs the odds columns, which live in df rather than X.
    df_test = df.iloc[test_index].assign(home_prob=y_prob, away_prob=1 - y_prob)
    flat_1u_pnl, win_1u_pnl, kelly_pnl = profitize(df_test, kelly_scale=100)

    results.append({
        'C': logr.C_,
        'Accuracy': accuracy_score(y_test, y_pred),
        'ROC AUC': roc_auc_score(y_test, y_prob),
        'Flat 1u PNL': flat_1u_pnl,
        'Win 1u PNL': win_1u_pnl,
        'Kelly PNL': kelly_pnl,
    })
    # Keep this fold's coefficients so they can be summarised across folds.
    coefs.append(
        pd.DataFrame({
            'feature': logr.feature_names_in_,
            'coef': logr.coef_[0],
        })
    )

df_results = pd.DataFrame(results)
print(f'Average Accuracy: {df_results.Accuracy.mean()}')
print(f"Average ROC AUC: {df_results['ROC AUC'].mean()}")
print(f"Flat 1u PNL: {df_results['Flat 1u PNL'].sum()}")
print(f"Win 1u PNL: {df_results['Win 1u PNL'].sum()}")
print(f"Kelly PNL: {df_results['Kelly PNL'].sum()}")
display(df_results.style.background_gradient(vmin=0, vmax=1, cmap="viridis"))
0%| | 0/25 [00:00<?, ?it/s]
Average Accuracy: 0.6022641509433962 Average ROC AUC: 0.651997574729799 Flat 1u PNL: 27.120780967526894 Win 1u PNL: 33.702595095243325 Kelly PNL: 3.021424688039474
| C | Accuracy | ROC AUC | Flat 1u PNL | Win 1u PNL | Kelly PNL | |
|---|---|---|---|---|---|---|
| 0 | [0.35938137] | 0.471698 | 0.438571 | -5.776715 | -9.966163 | -349.027682 |
| 1 | [0.00077426] | 0.603774 | 0.672857 | 2.298095 | 4.492207 | -72.754322 |
| 2 | [0.00077426] | 0.433962 | 0.480000 | 2.649231 | -2.562134 | 194.903584 |
| 3 | [0.00077426] | 0.584906 | 0.668571 | -6.033787 | -2.979082 | -109.172533 |
| 4 | [0.0001] | 0.584906 | 0.609971 | 2.990000 | 1.096302 | 102.046840 |
| 5 | [0.00077426] | 0.660377 | 0.730205 | 3.690484 | 6.535655 | 36.681473 |
| 6 | [0.0001] | 0.660377 | 0.673789 | 0.102381 | -0.693778 | 25.834222 |
| 7 | [0.0001] | 0.622642 | 0.687143 | -1.137619 | 0.363564 | -14.487678 |
| 8 | [0.0001] | 0.660377 | 0.740000 | -8.330952 | -2.445655 | -185.463058 |
| 9 | [0.0001] | 0.603774 | 0.702857 | 4.316075 | 5.316572 | -9.910492 |
| 10 | [0.0001] | 0.566038 | 0.638571 | -3.478528 | -1.514334 | -42.217336 |
| 11 | [0.00077426] | 0.641509 | 0.660000 | 5.252660 | 4.839349 | 26.842245 |
| 12 | [0.00599484] | 0.566038 | 0.682609 | -6.012071 | -2.417448 | -51.766045 |
| 13 | [0.00599484] | 0.566038 | 0.528986 | 10.934954 | 4.916761 | 57.730433 |
| 14 | [0.00077426] | 0.584906 | 0.623512 | 6.427688 | 5.441498 | 68.806396 |
| 15 | [0.00077426] | 0.584906 | 0.608262 | 5.525149 | 1.073741 | 71.747317 |
| 16 | [0.00077426] | 0.566038 | 0.626488 | 15.041337 | 11.492202 | 115.592080 |
| 17 | [0.00077426] | 0.641509 | 0.739316 | -5.543025 | -0.760136 | -39.132553 |
| 18 | [0.00599484] | 0.603774 | 0.659420 | 13.121338 | 8.414751 | 219.898644 |
| 19 | [0.00077426] | 0.603774 | 0.731322 | 5.481316 | 9.831838 | 29.108564 |
| 20 | [0.00077426] | 0.622642 | 0.676638 | -7.621014 | -8.353649 | -27.779952 |
| 21 | [0.00077426] | 0.792453 | 0.811594 | -10.724180 | -4.130716 | -53.483497 |
| 22 | [0.00077426] | 0.735849 | 0.802899 | -9.417097 | -4.942845 | -70.578776 |
| 23 | [0.00599484] | 0.641509 | 0.733138 | 2.964514 | 6.827927 | 60.380665 |
| 24 | [0.00077426] | 0.452830 | 0.373219 | 10.400548 | 3.826171 | 19.222883 |
# Summarise each feature's coefficient across all folds and show the five
# features with the largest absolute median coefficient.
coef_summary = (
    pd.concat(coefs, axis=0)
    .groupby('feature')
    .agg({'coef': ['mean', 'median', 'std']})
)
coef_summary.sort_values(('coef', 'median'), key=abs, ascending=False).head()
| coef | |||
|---|---|---|---|
| mean | median | std | |
| feature | |||
| home_goal_diff_offense_str | -0.040995 | -0.028893 | 0.055133 |
| away_goal_diff_defense_str | -0.017175 | -0.020192 | 0.048657 |
| home_implied_prob | 0.018102 | 0.013422 | 0.019904 |
| away_implied_prob | -0.016325 | -0.013054 | 0.015817 |
| blocks_hia | -0.017953 | -0.012090 | 0.028508 |
# Day-by-day simulation: for every game date, train on the games played
# earlier in the same season, predict that date's games, and accumulate the
# P&L of each staking strategy.
dfs = []
game_dates = df.groupby(['season', 'game_date']).count().reset_index()[['season', 'game_date']]
kelly_bankroll = 0
all_game_dates = tqdm(game_dates.itertuples(), total=game_dates.shape[0])

# Columns carried into the per-game result frame.
report_columns = [
    'season', 'game_date', 'home', 'away', 'home_goals', 'away_goals',
    'home_win', 'home_consensus', 'away_consensus',
    'home_implied_prob', 'away_implied_prob',
]

for scope in all_game_dates:
    # Build the training mask once and reuse it for both y and X.
    earlier_this_season = (
        (df.game_date < scope.game_date) & (df.season == scope.season)
    ).to_numpy()
    y_train = y[earlier_this_season]

    # Skip dates without enough history: at least 16 prior games and more
    # than 5 examples of each outcome are required before fitting.
    if (len(y_train) < 16
            or np.count_nonzero(y_train == 0) <= 5
            or np.count_nonzero(y_train == 1) <= 5):
        continue

    X_train = X.loc[earlier_this_season].copy()

    # Explicit copy: columns are added below, and assigning into a slice of
    # df would raise SettingWithCopyWarning.
    df_test = df.loc[df.game_date == scope.game_date, report_columns].copy()
    # Select by label, not position, so this stays correct even if the index
    # is ever something other than 0..n-1.
    X_test = X.loc[df_test.index].copy()
    df_test['away_win'] = np.where(df_test.home_win, 0, 1)

    X_train, X_test = scale_and_dummy(X_train, X_test, df)

    # Fixed regularisation strength; it matches the value most folds selected
    # in the LogisticRegressionCV run earlier in this file.
    logr = LogisticRegression(C=.00077426, solver='liblinear')
    logr.fit(X_train, y_train)
    y_prob = logr.predict_proba(X_test)[:, 1]

    df_test['home_prob'] = y_prob
    df_test['away_prob'] = 1 - df_test.home_prob

    flat_1u_pnl, win_1u_pnl, kelly_pnl = profitize(df_test, aggregated=False, kelly_scale=100)
    df_test['flat_1u_pnl'] = flat_1u_pnl
    df_test['win_1u_pnl'] = win_1u_pnl
    df_test['kelly_pnl'] = kelly_pnl

    # Show the running Kelly bankroll in the progress bar.
    kelly_bankroll += kelly_pnl.sum()
    all_game_dates.set_description(f'{scope.game_date} | ${kelly_bankroll:.0f}')
    dfs.append(df_test)

# One frame with every simulated game, in the order the dates were processed.
df_test = pd.concat(dfs)
0%| | 0/221 [00:00<?, ?it/s]
# Build per-season cumulative P&L curves for each staking strategy and plot
# the Kelly curve over time.
df_pnl = df_test.assign(
    game_date=pd.to_datetime(df_test['game_date'], format='%Y%m%d')
)
for strategy in ('flat_1u', 'win_1u', 'kelly'):
    # The running total restarts for every season.
    df_pnl[f'cum_{strategy}_pnl'] = df_pnl.groupby('season')[f'{strategy}_pnl'].cumsum()

fig = px.area(df_pnl, x='game_date', y='cum_kelly_pnl', color='season')
fig.update_yaxes(title='Profit / Loss')
fig.update_xaxes(title='Date')