This project uses data from the Association of Tennis Professionals (ATP) men's league from 2009 to 2017 to build a multiple linear regression model and understand which features are linearly correlated with success as a male tennis player. Rather than offering rare and valuable insight into tennis, the purpose of this project is to build and evaluate a multiple linear regression model for pedagogic and display purposes only.
The data represents the top 1500 players within the aforementioned period. Credit for the data goes to Codecademy. Beyond ID data, the dataset includes 'offensive' (service game) statistics, 'defensive' (return game) statistics, and outcomes. A description of each column in the dataset was provided:
-Player: name of the tennis player.
-Year: year the data was recorded.
-Aces: number of serves by the player where the receiver does not touch the ball.
-DoubleFaults: number of times the player missed both first and second serve attempts.
-FirstServe: % of first-serve attempts made.
-FirstServePointsWon: % of first-serve attempt points won by the player.
-SecondServePointsWon: % of second-serve attempt points won by the player.
-BreakPointsFaced: number of times where the receiver could have won the service game of the player.
-BreakPointsSaved: % of the time the player was able to stop the receiver from winning the service game when they had the chance.
-ServiceGamesPlayed: total number of games where the player served.
-ServiceGamesWon: total number of games where the player served and won.
-TotalServicePointsWon: % of points won in games where the player served.
-FirstServeReturnPointsWon: % of opponents' first-serve points the player was able to win.
-SecondServeReturnPointsWon: % of opponents' second-serve points the player was able to win.
-BreakPointsOpportunities: number of times where the player could have won the service game of the opponent.
-BreakPointsConverted: % of the time the player was able to win their opponent's service game when they had the chance.
-ReturnGamesPlayed: total number of games where the player's opponent served.
-ReturnGamesWon: total number of games where the player's opponent served and the player won.
-ReturnPointsWon: total number of points where the player's opponent served and the player won.
-TotalPointsWon: % of points won by the player.
-Wins: number of matches won in a year.
-Losses: number of matches lost in a year.
-Winnings: total winnings in USD ($) in a year.
-Ranking: ranking at the end of the year.
This pipeline uses seaborn, pandas, matplotlib, statsmodels and sklearn. If there is something you think could be improved, or done differently, feel free to reach out!
We'll start by importing all the required libraries, loading the dataset, and performing an initial exploration of the data to check for missing values, issues with data types, etc.
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
tennis_data = pd.read_csv('tennis_stats.csv')
tennis_data.head()
Player | Year | FirstServe | FirstServePointsWon | FirstServeReturnPointsWon | SecondServePointsWon | SecondServeReturnPointsWon | Aces | BreakPointsConverted | BreakPointsFaced | ... | ReturnGamesWon | ReturnPointsWon | ServiceGamesPlayed | ServiceGamesWon | TotalPointsWon | TotalServicePointsWon | Wins | Losses | Winnings | Ranking | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Pedro Sousa | 2016 | 0.88 | 0.50 | 0.38 | 0.50 | 0.39 | 0 | 0.14 | 7 | ... | 0.11 | 0.38 | 8 | 0.50 | 0.43 | 0.50 | 1 | 2 | 39820 | 119 |
1 | Roman Safiullin | 2017 | 0.84 | 0.62 | 0.26 | 0.33 | 0.07 | 7 | 0.00 | 7 | ... | 0.00 | 0.20 | 9 | 0.67 | 0.41 | 0.57 | 0 | 1 | 17334 | 381 |
2 | Pedro Sousa | 2017 | 0.83 | 0.60 | 0.28 | 0.53 | 0.44 | 2 | 0.38 | 10 | ... | 0.16 | 0.34 | 17 | 0.65 | 0.45 | 0.59 | 4 | 1 | 109827 | 119 |
3 | Rogerio Dutra Silva | 2010 | 0.83 | 0.64 | 0.34 | 0.59 | 0.33 | 2 | 0.33 | 5 | ... | 0.14 | 0.34 | 15 | 0.80 | 0.49 | 0.63 | 0 | 0 | 9761 | 125 |
4 | Daniel Gimeno-Traver | 2017 | 0.81 | 0.54 | 0.00 | 0.33 | 0.33 | 1 | 0.00 | 2 | ... | 0.00 | 0.20 | 2 | 0.50 | 0.35 | 0.50 | 0 | 1 | 32879 | 272 |
5 rows × 24 columns
tennis_data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1721 entries, 0 to 1720
Data columns (total 24 columns):
 #   Column                      Non-Null Count  Dtype
---  ------                      --------------  -----
 0   Player                      1721 non-null   object
 1   Year                        1721 non-null   int64
 2   FirstServe                  1721 non-null   float64
 3   FirstServePointsWon         1721 non-null   float64
 4   FirstServeReturnPointsWon   1721 non-null   float64
 5   SecondServePointsWon        1721 non-null   float64
 6   SecondServeReturnPointsWon  1721 non-null   float64
 7   Aces                        1721 non-null   int64
 8   BreakPointsConverted        1721 non-null   float64
 9   BreakPointsFaced            1721 non-null   int64
 10  BreakPointsOpportunities    1721 non-null   int64
 11  BreakPointsSaved            1721 non-null   float64
 12  DoubleFaults                1721 non-null   int64
 13  ReturnGamesPlayed           1721 non-null   int64
 14  ReturnGamesWon              1721 non-null   float64
 15  ReturnPointsWon             1721 non-null   float64
 16  ServiceGamesPlayed          1721 non-null   int64
 17  ServiceGamesWon             1721 non-null   float64
 18  TotalPointsWon              1721 non-null   float64
 19  TotalServicePointsWon       1721 non-null   float64
 20  Wins                        1721 non-null   int64
 21  Losses                      1721 non-null   int64
 22  Winnings                    1721 non-null   int64
 23  Ranking                     1721 non-null   int64
dtypes: float64(12), int64(11), object(1)
memory usage: 322.8+ KB
tennis_data.isna().sum()
Player                        0
Year                          0
FirstServe                    0
FirstServePointsWon           0
FirstServeReturnPointsWon     0
SecondServePointsWon          0
SecondServeReturnPointsWon    0
Aces                          0
BreakPointsConverted          0
BreakPointsFaced              0
BreakPointsOpportunities      0
BreakPointsSaved              0
DoubleFaults                  0
ReturnGamesPlayed             0
ReturnGamesWon                0
ReturnPointsWon               0
ServiceGamesPlayed            0
ServiceGamesWon               0
TotalPointsWon                0
TotalServicePointsWon         0
Wins                          0
Losses                        0
Winnings                      0
Ranking                       0
dtype: int64
tennis_data['Player'].value_counts()
Ivan Dodig         9
Leonardo Mayer     9
Dudi Sela          9
Evgeny Donskoy     9
Go Soeda           9
                  ..
Gleb Sakharov      1
Vaclav Safranek    1
Alex de Minaur     1
Takao Suzuki       1
Kento Takeuchi     1
Name: Player, Length: 438, dtype: int64
tennis_data.describe()
Year | FirstServe | FirstServePointsWon | FirstServeReturnPointsWon | SecondServePointsWon | SecondServeReturnPointsWon | Aces | BreakPointsConverted | BreakPointsFaced | BreakPointsOpportunities | ... | ReturnGamesWon | ReturnPointsWon | ServiceGamesPlayed | ServiceGamesWon | TotalPointsWon | TotalServicePointsWon | Wins | Losses | Winnings | Ranking | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 1721.000000 | 1721.000000 | 1721.000000 | 1721.000000 | 1721.000000 | 1721.000000 | 1721.000000 | 1721.000000 | 1721.000000 | 1721.000000 | ... | 1721.000000 | 1721.000000 | 1721.000000 | 1721.000000 | 1721.000000 | 1721.000000 | 1721.000000 | 1721.000000 | 1.721000e+03 | 1721.000000 |
mean | 2013.646717 | 0.598053 | 0.680738 | 0.261673 | 0.479733 | 0.466432 | 97.105171 | 0.369407 | 112.003486 | 102.918071 | ... | 0.173823 | 0.342208 | 197.650203 | 0.715590 | 0.473155 | 0.599245 | 7.876816 | 9.278908 | 2.344928e+05 | 269.610691 |
std | 2.488018 | 0.054533 | 0.070422 | 0.056639 | 0.066902 | 0.068447 | 137.966077 | 0.162987 | 119.247651 | 122.761670 | ... | 0.080880 | 0.049369 | 221.208703 | 0.123287 | 0.037139 | 0.057718 | 10.183716 | 8.996450 | 2.530537e+05 | 277.341947 |
min | 2009.000000 | 0.360000 | 0.270000 | 0.000000 | 0.060000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.220000 | 0.250000 | 0.000000 | 0.000000 | 1.080000e+02 | 3.000000 |
25% | 2012.000000 | 0.570000 | 0.650000 | 0.240000 | 0.460000 | 0.440000 | 7.000000 | 0.320000 | 15.000000 | 9.000000 | ... | 0.130000 | 0.320000 | 22.000000 | 0.670000 | 0.460000 | 0.570000 | 0.000000 | 2.000000 | 4.931100e+04 | 83.000000 |
50% | 2014.000000 | 0.600000 | 0.690000 | 0.270000 | 0.490000 | 0.480000 | 34.000000 | 0.380000 | 55.000000 | 41.000000 | ... | 0.180000 | 0.350000 | 86.000000 | 0.750000 | 0.480000 | 0.610000 | 3.000000 | 5.000000 | 1.252120e+05 | 166.000000 |
75% | 2016.000000 | 0.630000 | 0.720000 | 0.290000 | 0.520000 | 0.500000 | 140.000000 | 0.430000 | 201.000000 | 172.000000 | ... | 0.220000 | 0.370000 | 348.000000 | 0.790000 | 0.500000 | 0.630000 | 13.000000 | 17.000000 | 3.500750e+05 | 333.000000 |
max | 2017.000000 | 0.880000 | 0.890000 | 0.480000 | 0.920000 | 0.750000 | 1185.000000 | 1.000000 | 507.000000 | 573.000000 | ... | 0.560000 | 0.510000 | 916.000000 | 1.000000 | 0.820000 | 0.820000 | 48.000000 | 36.000000 | 1.074562e+06 | 1443.000000 |
8 rows × 23 columns
It looks like the dataset has no missing entries and that the dtypes correspond to what we expected. Additionally, we have a good number of players represented and a good range of rankings. However, there are many players who are 'repeats' in the data, since they frequently feature in the highest rankings. Since our linear regression model relies on independent observations, we can't work with players who have more than one entry. So, before doing anything else, we'll drop the duplicates based on the Player column.
tennis_data = tennis_data.drop_duplicates(subset='Player')
The first thing we'll now do is perform exploratory analysis via plotting. Considering that we're interested in "success" for a tennis player, it would be useful to see which features (variables) correlate strongly with outcomes. There are four outcome columns in the dataset: Wins, Losses, Winnings and Ranking. Since Winnings and Ranking are basically a function of Wins and Losses (at least out of sheer logic), we'll focus on the latter two to see what predicts (i.e. correlates with) these outcomes. We'll thus explore which variables from the "Offensive" and "Defensive" categories correlate strongly with the outcomes. We'll also plot the distribution of the independent variables while we're at it, and check the outlier situation for each variable.
Let's start with the "Offensive" variables.
offensive_vars = tennis_data.columns[[7,12,2,3,5,9,11,16,17,19]]
fig, axes = plt.subplots(len(offensive_vars),4, figsize=(25, 60))
fig.subplots_adjust(hspace=0.3, wspace=0.2)
sns.set(font_scale=1.3)
for i, var in enumerate(offensive_vars):
    # Scatterplots of the variable against the two outcomes of interest
    sns.scatterplot(x=var, y='Wins', data=tennis_data, ax=axes[i,0], alpha=0.4).set_title(f'{var} vs Wins', fontsize=18, weight='bold')
    sns.scatterplot(x=var, y='Losses', data=tennis_data, ax=axes[i,1], alpha=0.4).set_title(f'{var} vs Losses', fontsize=18, weight='bold')
    # Distribution and outlier check of the variable itself
    sns.histplot(x=var, data=tennis_data, ax=axes[i,2]).set_title('Distribution', fontsize=18, weight='bold')
    sns.boxplot(x=var, data=tennis_data, ax=axes[i,3]).set_title('Outliers', fontsize=18, weight='bold')
We'll do the same for the "Defensive" variables now.
defensive_vars = tennis_data.columns[[4,6,10,8,13,14,15,18]]
fig, axes = plt.subplots(len(defensive_vars),4, figsize=(25, 50))
fig.subplots_adjust(hspace=0.3, wspace=0.2)
sns.set(font_scale=1.3)
for i, var in enumerate(defensive_vars):
    # Scatterplots of the variable against the two outcomes of interest
    sns.scatterplot(x=var, y='Wins', data=tennis_data, ax=axes[i,0], alpha=0.4).set_title(f'{var} vs Wins', fontsize=18, weight='bold')
    sns.scatterplot(x=var, y='Losses', data=tennis_data, ax=axes[i,1], alpha=0.4).set_title(f'{var} vs Losses', fontsize=18, weight='bold')
    # Distribution and outlier check of the variable itself
    sns.histplot(x=var, data=tennis_data, ax=axes[i,2]).set_title('Distribution', fontsize=18, weight='bold')
    sns.boxplot(x=var, data=tennis_data, ax=axes[i,3]).set_title('Outliers', fontsize=18, weight='bold')
This gives us a nice overview of the shape of the variables and their relationship to our two most important outcomes: Wins and Losses.
From a quick glance at these figures, we can see that there are four independent variables that show a good linear correlation with both outcomes of interest: two "offensive" variables (BreakPointsFaced and ServiceGamesPlayed), and two "defensive" variables (BreakPointsOpportunities and ReturnGamesPlayed), with the BreakPointsFaced and Wins pair showing the weakest association of the lot. They also have very few outliers.
But we have a big problem: these variables correlate positively with both Wins and Losses, meaning they are of little use in understanding what we (if we were tennis players) should focus on to maximise our wins while avoiding losses.
Additionally, if we revisit the descriptions of these variables:
-BreakPointsFaced: number of times where the receiver could have won the service game of the player.
-ServiceGamesPlayed: total number of games where the player served.
-BreakPointsOpportunities: number of times where the player could have won the service game of the opponent.
-ReturnGamesPlayed: total number of games where the player's opponent served.
It appears as if the relationship between these variables and the chosen outcomes boils down to "the more you play, the more you can either win or lose" (which is basically useless insight), and that we might have some collinearity between these features when we evaluate the model (we'll check this later).
Another option is to explore the relationship of the same variables with the other outcomes that we sidelined (Winnings and Ranking). We'll do this now so we can keep exploring and choose the best features to build our model with, hopefully gaining some interesting insights along the way. This time, however, we'll visualise the relationships between all variables using a correlation matrix displayed as a heatmap.
data = tennis_data.drop(['Player', 'Year'], axis=1).corr()
labels = round(data,2)
sns.set(font_scale=0.85, rc={'figure.figsize': (12,12)})
sns.heatmap(data, cmap='vlag', center=0, annot=labels).set_title('Tennis Data Linear Relationships Heatmap', weight='bold', fontsize=18)
plt.show()
The .corr() method calculates the correlation between the variables (using Pearson's approach by default) and outputs a correlation matrix that contains Pearson's coefficients for each pair of variables in the dataset. Visualising this correlation matrix as a heatmap is a convenient way to quickly check linear relationships between variables when the dataset contains too many variables and a pairplot would give you an output that's too large and cumbersome to read. Additionally, using this method allows us to see the numerical values of said relationships (so we don't have to guess how strong a relationship is). The downside of this approach is that it will not show you other kinds of relationships that scatterplots might make evident, so be careful!
In the case of this dataset, there appears to be no variable that correlates uniquely with either Wins, Losses, Winnings or Ranking. In fact, the variables that correlate most strongly with Wins also correlate just as strongly with Losses and Winnings, unfortunately. From this perspective, it appears that we will not be getting much insight into what predicts either outcome, and this dataset might simply reflect non-linear relationships in the data at this level of tennis performance.
Nonetheless, we will build and evaluate some linear models to understand the underlying assumptions of the models and common pitfalls to avoid.
Linear regression, although simple to understand and powerful in many cases, relies on several underlying assumptions that need to be met if the model is going to be an accurate representation of the data, and if it's going to give us appropriate predictive power. These assumptions are:
-Independence of observations.
-No hidden or missing variables.
-Linear relationship.
-Normality of residuals.
-No or little multicollinearity.
-Homoscedasticity.
-Independent variables are uncorrelated with the error term.
-Observations of error term are uncorrelated with each other.
Let's explore them one by one and use the dataset at hand to check if the model we will build passes the test.
Independence of observations: Simply put, this means that one observation is not related to another in some obvious manner (e.g. entries from the same patient in the case of medical data, or the same vendor in the case of sales data). In the case of our dataset, the same player can appear in more than one entry because they featured in the top rankings in more than one year. We dealt with this issue earlier when we used the .drop_duplicates() method with the Player column as reference. If the data had no ID column (or was anonymised for ethical purposes), we would need to check the methodology under which the data was collected to see if this assumption has been met for the dataset at hand.
No hidden or missing variables: Basically, that you have (hopefully) collected all the relevant variables that predict an outcome; otherwise the model will simply not be a good reflection of what predicts the outcome. It is hard to check for this assumption, since it's effectively a question of methodology design rather than of data evaluation, but if you end up with the wrong model -- or at the very least an incomplete one -- even after doing everything right, this is a plausible explanation and a consideration for future research approaches.
Linear relationship: This seems obvious, but it is important to check that the independent variable(s) and the outcome share a linear relationship. If you can see by eye in a scatterplot that the relation is not linear (e.g. it is exponential, polynomial, etc.) and you still apply a linear regression model to the data, you are simply applying the wrong model. This is why the initial exploratory analysis is critical to see if a linear model might be appropriate (or at least not totally misguided). Have a look again at the scatterplots and the heatmap to check which variables share (or do not share) a linear relationship with the outcome variables. As a rule of thumb, a Pearson's coefficient above 0.7 (for positive linear relationships) or below -0.7 (for negative linear relationships) can be considered a strong linear relationship.
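As a quick way of applying this rule of thumb to our data (a minimal sketch using the tennis_data frame loaded above and Winnings as the outcome of interest), we can list the variables whose Pearson coefficient against Winnings exceeds 0.7 in absolute value:
# Pearson correlation of every numeric column with Winnings
winnings_corr = tennis_data.drop(['Player', 'Year'], axis=1).corr()['Winnings'].drop('Winnings')
# Keep only the 'strong' linear relationships per the +/-0.7 rule of thumb
print(winnings_corr[winnings_corr.abs() > 0.7].sort_values(ascending=False))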
To check the next set of assumptions we need to actually build a model and predict some values using the model and a validation dataset. By convention, we will use X as the collection of independent variables and y as the outcome (dependent/response) variable.
IMPORTANT NOTE: The initial model will contain all the variables that have a high linear correlation with the outcome Winnings, and will thus violate many assumptions for the sake of the exercise, which we will correct later.
X = tennis_data[['Aces', 'BreakPointsFaced', 'BreakPointsOpportunities', 'DoubleFaults', 'ReturnGamesPlayed','ServiceGamesPlayed']]
y = tennis_data['Winnings']
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.7, test_size = 0.3, random_state=100)
lm = LinearRegression()
lm.fit(X_train, y_train);
Now let's get back to checking our assumptions.
Normality of residuals: Residuals are the difference between the outcome values predicted by the model for the validation data and the actual outcome values in the validation data. If the residuals are not normally distributed, it means that the model is not appropriate for the data at hand (e.g. a non-linear relationship, multiple linear relationships rather than a single one, etc.). For this, we first need to calculate the residuals:
y_pred = lm.predict(X_test) #Predicted outcome values using the model and the test data
residuals = y_test - y_pred
And now we plot them:
fig = plt.figure(figsize=(5,5))
plt.hist(residuals, bins=20)
plt.title('Distribution of Residuals', fontsize=12, weight='bold')
plt.ylabel('Frequency', fontsize=10)
plt.show()
The distribution doesn't look close to normal: it is heavily centered and it has extra weight on the tails. This might indicate that there is a problem with the model. Another way to look at the residuals is what is known as a QQ-plot, which graphs the relationship between the quantiles in the sample data against the theoretical quantiles that the data would have if it followed a perfectly normal distribution.
import statsmodels.api as sm
with plt.rc_context():
    plt.rc("figure", figsize=(5,5))
    fig = sm.qqplot(residuals, line='r') #by default compares to a normal distribution, but this can be changed
    plt.title('QQ-Plot of Residuals', fontsize=12, weight='bold')
    plt.show()
A QQ-plot of the residuals of a model that follows a normal distribution would show the points fitting the line to a good degree. This is not the case here, further confirming that the model has some issues, as the residuals are not normally distributed. The "weight at the tails" we noted in the previous histogram also shows up here as outliers at both ends of the line. This calls for a review of the model, but we will do that after checking the remaining assumptions.
No or little multicollinearity: Collinearity is when two independent variables are highly linearly correlated and both are used to predict the outcome of choice. Multicollinearity is when a variable is highly correlated with two or more other independent variables. This is important to check for a number of reasons:
More variables predicting an outcome is not always good if they are related to each other, as it means the explanatory power of one variable cannot be disentangled from the other variable. Put differently, the model would not know which variable is more important than the other when predicting the outcome, rendering one of the variables redundant. By way of example, if we wanted to predict heart disease status in people and had the variables "height", "weight" and "BMI" in our dataset, it is highly likely that these variables will present (multi)collinearity, since they are related (and BMI is effectively a function of height and weight).
It introduces noise in the model, as each variable comes with its own error when fitting, so the more variables we have, the higher the total error of the model, which is especially unnecessary when one of the variables could be dropped without losing predictive power.
A good way to check for (multi)collinearity is the Variance Inflation Factor (VIF). The VIF shows "the strength of the linear relationship between the variable and the remaining explanatory variables", i.e. how much the regression coefficient of the variable at hand is 'inflated' due to the collinearity. This also means that the relationship between the variable and the outcome is affected by the presence of the other variables, which makes the predictions less accurate. It is an error we need to minimise!
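For reference, the VIF of a given predictor is obtained by regressing that predictor on all the remaining predictors and plugging the resulting coefficient of determination into:
$$VIF_i = \frac{1}{1 - R_i^2}$$
Where R_i^2 is the R-squared of regressing predictor i on all the other predictors. A predictor that is completely uncorrelated with the rest gives a VIF of 1; the stronger its linear relationship with the other predictors, the larger its VIF.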
The VIF can be calculated using statsmodels:
from statsmodels.stats.outliers_influence import variance_inflation_factor
vif_data = pd.DataFrame({'Feature': X_test.columns, 'VIF': [variance_inflation_factor(X_test.values, i) for i in range(len(X_test.columns))]})
vif_data
Feature | VIF | |
---|---|---|
0 | Aces | 16.079797 |
1 | BreakPointsFaced | 76.148170 |
2 | BreakPointsOpportunities | 43.635450 |
3 | DoubleFaults | 16.608583 |
4 | ReturnGamesPlayed | 7380.594760 |
5 | ServiceGamesPlayed | 7620.490832 |
This shows that there is (impressive) multicollinearity in our model, understandably: since playing a Return Game means by default that you will play a Service Game and vice versa, these two variables are linked and are interdependent. We will remove them in our next iterations. Similar story with the variables related to Break Points.
Ideally, our model should show VIFs close to 1 for all variables (1 is the lowest possible value, and the closer to it the better); values below 5 are generally considered acceptable. From 5 to 10 is undesirable, and over 10 is unacceptable and would mean that there is something fundamentally wrong with the model.
Homoscedasticity: This means that the variance of the error (the difference between the predicted outcome values and the actual outcome values) is constant across the range of predictions. This can be quickly checked with a scatterplot of the residuals against the outcome variable. We should expect the points to show constant variation with respect to the zero line.
fig = plt.figure(figsize=(5,5))
plt.scatter(y_test, residuals, alpha=0.4, s=15)
plt.plot(y_test, [0]*len(y_test), c='r')
plt.ylabel('Error size')
plt.xlabel('Outcome variable (Winnings, in millions)')
plt.title('Variance in Error of Residuals', fontsize=12, weight='bold')
plt.show()
In this case, the pattern is clearly not homoscedastic (i.e. it is heteroscedastic).
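As a numerical complement to the visual check (a small sketch using the residuals and X_test defined above), statsmodels offers the Breusch-Pagan test, whose null hypothesis is homoscedasticity; a very small p-value would support what we see in the plot:
from statsmodels.stats.diagnostic import het_breuschpagan
# Breusch-Pagan test: regresses the squared residuals on the predictors (plus a constant)
bp_lm, bp_lm_pvalue, bp_f, bp_f_pvalue = het_breuschpagan(residuals, sm.add_constant(X_test))
print('Breusch-Pagan LM p-value:', bp_lm_pvalue)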
Independent variables are uncorrelated with the error term: Basically, that there are no glaringly obvious patterns and relationships between the independent variables and the residuals.
fig, axes = plt.subplots(2,3, figsize=(15, 10))
fig.subplots_adjust(hspace=0.3, wspace=0.22)
sns.set(font_scale=0.8)
subplts = [axes[0,0], axes[0,1], axes[0,2], axes[1,0], axes[1,1], axes[1,2]]
colnames = X_test.columns
for i in range(len(colnames)):
    sns.scatterplot(x=X_test[colnames[i]], y=residuals, ax=subplts[i], alpha=0.4).set_title(f'{colnames[i]} vs \nResiduals (Winnings)', fontsize=12, weight='bold')
    sns.lineplot(x=X_test[colnames[i]], y=[0]*len(X_test[colnames[i]]), ax=subplts[i], color='r')
    subplts[i].set_yticks([-300000, -200000, -100000, 0, 100000, 200000, 300000],['-300K', '-200K', '-100K', '0', '100K', '200K', '300K'])
    subplts[i].set_ylabel('Residuals (Winnings)')
Here we do not see any correlation between the variables and the residuals, but the fact that the same pattern repeats between all variables and the residuals is probably another indication of the collinearity we observed numerically with the VIF.
Observations of error term are uncorrelated with each other: Basically, that there are no patterns in the residuals themselves (similar to the check for normality of residuals). This means that one observation of the error term (residual) should not predict the next one. We want to observe randomness instead of patterns, cycles, and so on.
fig = plt.figure(figsize=(12,5))
sns.lineplot(x=residuals.index, y=residuals, linewidth=1)
plt.xlabel('Index of Residual')
plt.ylabel('Residuals (Winnings)')
plt.yticks([-300000, -200000, -100000, 0, 100000, 200000, 300000],['-300K', '-200K', '-100K', '0', '100K', '200K', '300K'])
plt.title('Autocorrelation in Residuals', fontsize=12, weight='bold')
plt.show()
This assumption is also met.
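If we wanted a numerical complement to the visual check above (a small sketch using the residuals computed earlier), the Durbin-Watson statistic quantifies first-order autocorrelation in the residuals; values close to 2 indicate little autocorrelation. statsmodels also reports this statistic in its OLS summary, which we will use later.
from statsmodels.stats.stattools import durbin_watson
print('Durbin-Watson statistic:', durbin_watson(residuals))  # values near 2 suggest uncorrelated residuals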
Now that we know where the model is failing, we will go back and correct these problems one by one, starting with the most glaring error: the levels of multicollinearity. Once we fix this, we will re-evaluate whether the new model complies with our assumptions. We will re-create the model, omitting the variables that had the highest VIF values: ReturnGamesPlayed and ServiceGamesPlayed.
X = tennis_data[['Aces', 'BreakPointsFaced', 'BreakPointsOpportunities', 'DoubleFaults']]
y = tennis_data['Winnings']
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.7, test_size = 0.3, random_state=100)
lm.fit(X_train, y_train);
vif_data = pd.DataFrame({'Feature': X_test.columns, 'VIF': [variance_inflation_factor(X_test.values, i) for i in range(len(X_test.columns))]})
vif_data
Feature | VIF | |
---|---|---|
0 | Aces | 4.030206 |
1 | BreakPointsFaced | 27.177893 |
2 | BreakPointsOpportunities | 19.802520 |
3 | DoubleFaults | 14.101704 |
That improved the VIF values, but we still have variables with high VIFs. Let's remove the next set of evidently collinear variables: BreakPointsFaced and BreakPointsOpportunities.
X = tennis_data[['Aces', 'DoubleFaults']]
y = tennis_data['Winnings']
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.7, test_size = 0.3, random_state=100)
lm.fit(X_train, y_train);
vif_data = pd.DataFrame({'Feature': X_train.columns, 'VIF': [variance_inflation_factor(X_train.values, i) for i in range(len(X_train.columns))]})
vif_data
Feature | VIF | |
---|---|---|
0 | Aces | 4.543423 |
1 | DoubleFaults | 4.543423 |
This is much better. We can be happy with that and move forward with checking if the model passes the rest of the tests. This time, without the commentary.
y_pred = lm.predict(X_test)
residuals = y_test - y_pred
fig, axes = plt.subplots(2,3, figsize=(15, 10))
fig.subplots_adjust(hspace=0.3, wspace=0.22)
sns.set(font_scale=0.8)
#Distribution of residuals
sns.histplot(residuals, ax=axes[0,0]).set_title('Distribution of Residuals', fontsize=12, weight='bold')
sm.qqplot(residuals, line='r', ax=axes[0,1], markersize=4)
axes[0,1].set_title('QQ Plot', fontsize=12, weight='bold')
#Homoscedasticity
sns.scatterplot(x=y_test, y=residuals, ax=axes[0,2], alpha=0.4).set_title('Homoscedasticity', fontsize=12, weight='bold')
axes[0,2].set_ylabel('Residuals')
sns.lineplot(x=y_test, y=[0]*len(y_test), ax=axes[0,2], color='r');
#Correlation between independent variables and residuals
sns.scatterplot(x=X_test['Aces'], y=residuals, ax=axes[1,0], alpha=0.4).set_title('Relationship between \nAces and Residuals (Winnings)', fontsize=12, weight='bold')
sns.lineplot(x=X_test['Aces'], y=[0]*len(X_test['Aces']), ax=axes[1,0], color='r');
sns.scatterplot(x=X_test['DoubleFaults'], y=residuals, ax=axes[1,1], alpha=0.4).set_title('Relationship between \nDoubleFaults and Residuals (Winnings)', fontsize=12, weight='bold')
sns.lineplot(x=X_test['DoubleFaults'], y=[0]*len(X_test['DoubleFaults']), ax=axes[1,1], color='r');
#Autocorrelation in residuals
sns.lineplot(x=residuals.index, y=residuals, ax=axes[1,2]).set_title('Autocorrelation of Residuals', fontsize=12, weight='bold');
axes[1,2].set_ylabel('Residuals')
subplts = [axes[0,1], axes[0,2], axes[1,0], axes[1,1], axes[1,2]]
colnames = X_test.columns
for i in range(len(subplts)):
    subplts[i].set_yticks([-400000, -200000, 0, 200000, 400000],['-400K', '-200K', '0', '200K', '400K'])
Sometimes fixing multicollinearity fixes other issues. This time it did not.
Our next clue for fixing the model is the distribution of the residuals in the histogram and the QQ plot. It appears as if the most extreme values in the residuals throw the model a bit overboard. Recall from the boxplots of our initial EDA that there are some extreme values in both Aces and DoubleFaults.
The problem is that scikit-learn's LinearRegression does not have many built-in functions to diagnose the influence of outliers, nor other summary tables. We can use statsmodels for this, although we have to re-create our linear regression model.
X_model = sm.add_constant(X_train)
sm_model = sm.OLS(y_train, X_model).fit()
sm_model.summary()
Dep. Variable: | Winnings | R-squared: | 0.813 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.812 |
Method: | Least Squares | F-statistic: | 659.2 |
Date: | Tue, 11 Oct 2022 | Prob (F-statistic): | 4.34e-111 |
Time: | 18:07:23 | Log-Likelihood: | -3963.6 |
No. Observations: | 306 | AIC: | 7933. |
Df Residuals: | 303 | BIC: | 7944. |
Df Model: | 2 | ||
Covariance Type: | nonrobust |
coef | std err | t | P>|t| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
const | 4.058e+04 | 6876.396 | 5.901 | 0.000 | 2.7e+04 | 5.41e+04 |
Aces | 536.2989 | 83.237 | 6.443 | 0.000 | 372.504 | 700.094 |
DoubleFaults | 3173.6079 | 243.427 | 13.037 | 0.000 | 2694.586 | 3652.630 |
Omnibus: | 143.150 | Durbin-Watson: | 1.977 |
---|---|---|---|
Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 1056.680 |
Skew: | 1.763 | Prob(JB): | 3.51e-230 |
Kurtosis: | 11.393 | Cond. No. | 185. |
We can check that this model effectively corresponds to the one we created using scikit-learn by comparing the coefficients and intercept (const) obtained with statsmodels against the ones from our previous model, so we can then use statsmodels to diagnose what we built using scikit-learn:
print(
'Aces coefficient:\t\t', lm.coef_[0], '\n'
'DoubleFaults coefficient:\t', lm.coef_[1], '\n'
'Intercept:\t\t\t', lm.intercept_
)
Aces coefficient:          536.2988660480429
DoubleFaults coefficient:  3173.6078643403675
Intercept:                 40577.862306813855
A popular method to evaluate the influence of each data point on a linear regression model (including likely outliers) is what's known as Cook's Distance, which measures the effect on the model of removing that observation. The higher the Cook's Distance, the higher the influence (leverage) of the data point. Importantly, the size of a residual is not necessarily correlated with the size of its influence!
cooks_dists = sm_model.get_influence().cooks_distance
fig, axes = plt.subplots(1,2, figsize=(10,5))
sns.scatterplot(x=X_model['Aces'], y=cooks_dists[0], alpha=0.4, ax=axes[0]).set_title('Cook\'s Distance for Aces', fontsize=12, weight='bold');
sns.scatterplot(x=X_model['DoubleFaults'], y=cooks_dists[0], alpha=0.4, ax=axes[1]).set_title('Cook\'s Distance for DoubleFaults', fontsize=12, weight='bold');
axes[0].set_ylabel('Distance');
axes[1].set_ylabel('Distance');
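One common rule of thumb (not the approach followed in the rest of this write-up, just a quick sketch) flags observations whose Cook's Distance exceeds 4/n, where n is the number of observations used to fit the model:
import numpy as np
cooks_threshold = 4 / len(X_train)  # rule-of-thumb cut-off for Cook's Distance
flagged = np.sum(cooks_dists[0] > cooks_threshold)
print(f'{flagged} observations exceed the 4/n threshold of {cooks_threshold:.4f}')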
There are a couple of ways to check leverage. The first one is visually plotting the values of the hat matrix diagonal against the studentised residuals (residuals divided by an estimate of their standard deviation). An observation with an absolute studentised residual higher than 3 is usually considered abnormally high, although some would argue this is not stringent enough.
hat_diag = sm_model.get_influence().hat_matrix_diag
studentised_residuals = sm_model.get_influence().resid_studentized_external
fig = plt.figure(figsize=(5,5))
sns.regplot(x=hat_diag, y=studentised_residuals, fit_reg=False, scatter_kws={'alpha':0.4, 's':15}).set_title('Leverage v Studentised Residuals', fontsize=12, weight='bold')
plt.plot(hat_diag, [3]*len(hat_diag), c='r', alpha=0.4)
plt.plot(hat_diag, [-3]*len(hat_diag), c='r', alpha=0.4)
plt.ylabel('Studentised Residuals', fontsize=10)
plt.xlabel('Leverage', fontsize=10);
We can check these results in a tabular fashion:
studentised_residuals = sm_model.get_influence().resid_studentized_external
df_st_resid = pd.DataFrame({'Index (Obs)': X_train.index, 'Stud Resid': studentised_residuals})
# IMPORTANT: If you don't specify the index from the training set as a column (Index (Obs)),
# the index displayed in the results will correspond to the index of the `.resid_studentized_external` object, and NOT
# to the index of the original observation that was used to build the model, and you will wonder, for example,
# why the results show an index that doesn't reflect the original dataset, or why some plots show an index that doesn't correspond
# to the size of the dataset used to create the model (see the influence plot a couple of paragraphs below).
outliers = df_st_resid[abs(df_st_resid['Stud Resid']) > 3]
outliers.sort_values(by='Index (Obs)')
Index (Obs) | Stud Resid | |
---|---|---|
206 | 107 | -4.821700 |
281 | 123 | 3.664859 |
250 | 313 | 4.512081 |
35 | 325 | 5.186609 |
107 | 555 | 5.671448 |
262 | 559 | 4.218032 |
10 | 892 | -3.409213 |
88 | 982 | 3.000684 |
235 | 1103 | 4.171207 |
Overall, having only 9 problematic points (for now) in a training set of 306 observations is outstandingly good, although -- again -- some people would argue that a cut-off point of 3 is too low a bar. This can be fine-tuned later if required.
The second way in which we can check for leverage is applying the leverage cut-off point given by the formula:
$$\frac{2k + 2}{n}$$
Where k is the number of predictors and n is the number of observations in the model. We can then use this cut-off to identify the points that have too much leverage:
leverage_cutoff = ((2*len(lm.coef_))+2)/len(X_train) #about 0.0196 for this model
df_influence = pd.DataFrame({'Index (Obs)': X_train.index, 'Influence': hat_diag})
# IMPORTANT: Again, if you don't specify the index from the training set as a column (Index Obs),
# the index displayed in the results will correspond to the index of the hat matrix, and NOT to the index of the original
# observation that was used to build the model.
high_influence = df_influence[(df_influence['Influence'] > leverage_cutoff)]
high_influence.sort_values(by='Influence', ascending=False)
Index (Obs) | Influence | |
---|---|---|
206 | 107 | 0.404833 |
55 | 49 | 0.153951 |
90 | 1573 | 0.129802 |
222 | 336 | 0.110273 |
58 | 63 | 0.102671 |
18 | 561 | 0.093042 |
62 | 705 | 0.077757 |
19 | 927 | 0.055980 |
46 | 725 | 0.054473 |
24 | 221 | 0.052040 |
51 | 361 | 0.038056 |
215 | 430 | 0.036055 |
187 | 732 | 0.034388 |
195 | 562 | 0.033957 |
235 | 1103 | 0.032844 |
137 | 101 | 0.032379 |
139 | 1005 | 0.029607 |
122 | 531 | 0.028054 |
40 | 612 | 0.027748 |
303 | 924 | 0.024581 |
37 | 388 | 0.023643 |
250 | 313 | 0.023528 |
79 | 554 | 0.021527 |
This time 23 observations were flagged, and 3 are repeats from the previous method: Indices 107, 313, and 1103.
So now we are wondering: can we choose one method over the other to determine influence? The answer is no. Since each method focuses on a different aspect of the observations, they diagnose different things about the points, as you can see from the differences between the two sets of results above. Let me explain.
We applied the studentised residual cut-off threshold of 3 along the "Studentised Residuals" axis of the data in order to determine outliers. The leverage cut-off threshold, on the other hand, is applied along the "Hat Matrix Diagonal" (leverage) axis. Check again the "Leverage v Studentised Residuals" graph we created a few cells ago, and you can see why this makes sense: having a high leverage value does not necessarily go hand in hand with having a high studentised residual.
It might be argued that being an outlier automatically qualifies a point as influential, but we need to remember we are applying two different criteria over two different dimensions of the data. Additionally, although outliers commonly have high influence (hence the argument), this is not necessarily the case every time for all datasets. It is always good practice to check both the outliers and the leverage of each point to see if the points with the largest influence are also the ones flagged as outliers. This makes more sense if we plot an additional cut-off line on our graph, so we can see the "quadrants" where outliers have high or low leverage, and which points have high leverage without being outliers.
fig = plt.figure(figsize=(5,5))
sns.regplot(x=hat_diag, y=studentised_residuals, fit_reg=False, scatter_kws={'alpha':0.4, 's':15}).set_title('Leverage v Studentised Residuals', fontsize=12, weight='bold')
plt.plot(hat_diag, [3]*len(hat_diag), c='r', alpha=0.4)
plt.plot(hat_diag, [-3]*len(hat_diag), c='r', alpha=0.4)
plt.vlines(leverage_cutoff, ymin=-5, ymax=6, color='r', alpha=0.4)
plt.ylabel('Studentised Residuals', fontsize=10)
plt.xlabel('Leverage', fontsize=10);
Now that we have seen and understood all of this, we can visualise influence with statsmodels' influence plot, which conveniently labels some of the most influential points with the index of the original observation.
fig, ax = plt.subplots(figsize=(8,8));
sm.graphics.influence_plot(sm_model, criterion='cooks', ax=ax, alpha=0.3, );
We can see that, effectively, observation 107 is by far the most influential.
Now, it is tempting to just go and remove every data point that doesn't comply with our stipulations because that's what we've been taught to do, but it is important to remember that outliers are not always simply "wrong" data points resulting from typos or machine malfunction: they can be real observations of the world and thus convey important information about the reality we are trying to model. This is especially true in human endeavours such as sports, earnings, hours worked in a week, and more, in which some people excel in scoring points, earning money, or working hours way beyond the vast majority of the population. This also calls into question whether certain datasets can be described by the chosen model (linear, logistic, etc.), and how accurate that description is.
Deleting these outliers is basically taking information away from the dataset, and thus the model will not take that information into consideration, for better or for worse. It is important to pause and think about what we want to model, and to disclose any caveats we have introduced when creating a model, to make sure whoever uses it understands its limitations (for example, the range of income or hours worked within which the model is valid). In some cases the extreme observations are exactly what we want to predict, in which case we should not remove them from the dataset.
What one person thinks is worth removing might be something worth keeping for someone else, and the trade-offs between the model's accuracy and the data represented by the model are for the modeller to consider. So the following will be an evaluation of the model without the points that we have so far identified as outliers or influential, which might not necessarily be what's ultimately best for the model.
#Getting the indices to be removed
outliers_indices = outliers['Index (Obs)'].to_list()
high_influence_indices = high_influence['Index (Obs)'].to_list()
to_remove = list(set(outliers_indices + high_influence_indices))
#Removing these observations
for i in to_remove:
    X_train = X_train.drop(i)
    y_train = y_train.drop(i)
lm.fit(X_train, y_train);
y_pred = lm.predict(X_test)
residuals = y_test - y_pred
X_model = sm.add_constant(X_train)
sm_model = sm.OLS(y_train, X_model).fit()
sm_model.summary()
Dep. Variable: | Winnings | R-squared: | 0.820 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.819 |
Method: | Least Squares | F-statistic: | 623.9 |
Date: | Tue, 11 Oct 2022 | Prob (F-statistic): | 9.78e-103 |
Time: | 18:07:24 | Log-Likelihood: | -3442.8 |
No. Observations: | 277 | AIC: | 6892. |
Df Residuals: | 274 | BIC: | 6902. |
Df Model: | 2 | ||
Covariance Type: | nonrobust |
coef | std err | t | P>|t| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
const | 2.735e+04 | 4470.318 | 6.118 | 0.000 | 1.86e+04 | 3.62e+04 |
Aces | 1007.3281 | 149.297 | 6.747 | 0.000 | 713.413 | 1301.243 |
DoubleFaults | 2738.2429 | 378.121 | 7.242 | 0.000 | 1993.851 | 3482.635 |
Omnibus: | 39.524 | Durbin-Watson: | 1.920 |
---|---|---|---|
Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 208.644 |
Skew: | 0.373 | Prob(JB): | 4.94e-46 |
Kurtosis: | 7.186 | Cond. No. | 94.0 |
fig, axes = plt.subplots(2,3, figsize=(15, 10))
fig.subplots_adjust(hspace=0.3, wspace=0.22)
sns.set(font_scale=0.8)
#Distribution of residuals
sns.histplot(residuals, ax=axes[0,0]).set_title('Distribution of Residuals', fontsize=12, weight='bold')
sm.qqplot(residuals, line='r', ax=axes[0,1], markersize=4)
axes[0,1].set_title('QQ Plot', fontsize=12, weight='bold')
#Homoscedasticity
sns.scatterplot(x=y_test, y=residuals, ax=axes[0,2], alpha=0.4).set_title('Homoscedasticity', fontsize=12, weight='bold')
axes[0,2].set_ylabel('Residuals')
sns.lineplot(x=y_test, y=[0]*len(y_test), ax=axes[0,2], color='r');
#Correlation between independent variables and residuals
sns.scatterplot(x=X_test['Aces'], y=residuals, ax=axes[1,0], alpha=0.4).set_title('Relationship between \nAces and Residuals (Winnings)', fontsize=12, weight='bold')
sns.lineplot(x=X_test['Aces'], y=[0]*len(X_test['Aces']), ax=axes[1,0], color='r');
sns.scatterplot(x=X_test['DoubleFaults'], y=residuals, ax=axes[1,1], alpha=0.4).set_title('Relationship between \nDoubleFaults and Residuals (Winnings)', fontsize=12, weight='bold')
sns.lineplot(x=X_test['DoubleFaults'], y=[0]*len(X_test['DoubleFaults']), ax=axes[1,1], color='r');
#Autocorrelation in residuals
sns.lineplot(x=residuals.index, y=residuals, ax=axes[1,2]).set_title('Autocorrelation of Residuals', fontsize=12, weight='bold');
axes[1,2].set_ylabel('Residuals')
subplts = [axes[0,1], axes[0,2], axes[1,0], axes[1,1], axes[1,2]]
colnames = X_test.columns
for i in range(len(subplts)):
    subplts[i].set_yticks([-400000, -200000, 0, 200000, 400000],['-400K', '-200K', '0', '200K', '400K'])
The model's R2 did not improve by much, and the model still does not do too well on the unknown data, likely because the unknown data (X_test) also contains outliers and highly influential points (remember we only detected them in, and removed them from, the portion of the data we originally used to train the model):
fig, axes = plt.subplots(1,2, figsize=(10,5))
sns.scatterplot(x=X_test['Aces'], y=y_test, alpha=0.4, ax=axes[0]).set_title('Aces and Winnings\nin Test Dataset', fontsize=12, weight='bold');
sns.scatterplot(x=X_test['DoubleFaults'], y=y_test, alpha=0.4, ax=axes[1]).set_title('DoubleFaults and Winnings\nin Test Dataset', fontsize=12, weight='bold');
The last iteration of this exercise will take one step back to try to fix the model by identifying and removing the outliers and highly influential points from the whole dataset. This time the outlier detection cut-off point will be more aggressive (2 instead of 3).
X_model = sm.add_constant(X)
sm_model = sm.OLS(y, X_model).fit()
hat_diag = sm_model.get_influence().hat_matrix_diag
studentised_residuals = sm_model.get_influence().resid_studentized_external
df_st_resid = pd.DataFrame({'Index (Obs)': X.index, 'Stud Resid': studentised_residuals})
outliers = df_st_resid[abs(df_st_resid['Stud Resid']) > 2]
outliers.sort_values(by='Index (Obs)')
Index (Obs) | Stud Resid | |
---|---|---|
44 | 56 | 2.739897 |
52 | 71 | 2.768442 |
74 | 101 | 2.275342 |
76 | 107 | -4.931413 |
86 | 123 | 3.636454 |
132 | 221 | -2.549095 |
160 | 288 | 2.098548 |
161 | 289 | 4.115735 |
166 | 308 | 5.263617 |
167 | 313 | 4.584519 |
171 | 325 | 5.115658 |
205 | 403 | -4.355229 |
216 | 432 | 2.662136 |
228 | 483 | 2.173804 |
248 | 555 | 5.619228 |
251 | 559 | 4.238278 |
252 | 561 | -2.332369 |
269 | 624 | 2.929817 |
281 | 672 | -4.376502 |
284 | 696 | 2.852104 |
295 | 732 | 2.419758 |
296 | 735 | -2.129421 |
320 | 892 | -3.360578 |
331 | 982 | 3.059259 |
344 | 1076 | 2.993149 |
347 | 1103 | 4.270050 |
403 | 1473 | -2.579514 |
leverage_cutoff = ((2*len(lm.coef_))+2)/len(X) #about 0.0137 this time
df_influence = pd.DataFrame({'Index (Obs)': X.index, 'Influence': hat_diag})
high_influence = df_influence[(df_influence['Influence'] > leverage_cutoff)]
high_influence.sort_values(by='Influence', ascending=False)
Index (Obs) | Influence | |
---|---|---|
76 | 107 | 0.327400 |
281 | 672 | 0.144068 |
40 | 49 | 0.127635 |
176 | 336 | 0.089514 |
415 | 1573 | 0.081889 |
285 | 705 | 0.065956 |
48 | 63 | 0.062699 |
252 | 561 | 0.057447 |
284 | 696 | 0.048018 |
342 | 1068 | 0.044198 |
324 | 927 | 0.034569 |
132 | 221 | 0.034551 |
293 | 725 | 0.034326 |
158 | 284 | 0.032214 |
295 | 732 | 0.028805 |
403 | 1473 | 0.028789 |
253 | 562 | 0.026947 |
166 | 308 | 0.026098 |
74 | 101 | 0.026080 |
265 | 612 | 0.023696 |
188 | 361 | 0.023148 |
214 | 430 | 0.022738 |
205 | 403 | 0.021963 |
333 | 1005 | 0.021381 |
296 | 735 | 0.020666 |
347 | 1103 | 0.020288 |
64 | 85 | 0.019682 |
323 | 924 | 0.018797 |
242 | 531 | 0.018779 |
136 | 231 | 0.018441 |
247 | 554 | 0.018314 |
199 | 388 | 0.016137 |
370 | 1222 | 0.015783 |
304 | 793 | 0.015362 |
167 | 313 | 0.014513 |
#Data
X = tennis_data[['Aces', 'DoubleFaults']]
y = tennis_data['Winnings']
#Getting the indices to be removed
outliers_indices = outliers['Index (Obs)'].to_list()
high_influence_indices = high_influence['Index (Obs)'].to_list()
to_remove = list(set(outliers_indices + high_influence_indices))
#Removing these observations
for i in to_remove:
    X = X.drop(i)
    y = y.drop(i)
#Split data, train, and test
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.7, test_size = 0.3, random_state=100)
lm.fit(X_train, y_train)
y_pred = lm.predict(X_test)
residuals = y_test - y_pred
#Visualise and Diagnose
fig, axes = plt.subplots(2,3, figsize=(15, 10))
fig.subplots_adjust(hspace=0.3, wspace=0.22)
sns.set(font_scale=0.8)
#Distribution of residuals
sns.histplot(residuals, ax=axes[0,0]).set_title('Distribution of Residuals', fontsize=12, weight='bold')
axes[0,0].set_xticks([-150000, -100000, -50000, 0, 50000, 100000, 150000],['-150K', '-100K', '-50K', '0', '50K', '100K', '150K'])
sm.qqplot(residuals, line='r', ax=axes[0,1], markersize=4)
axes[0,1].set_title('QQ Plot', fontsize=12, weight='bold')
#Homoscedasticity
sns.scatterplot(x=y_test, y=residuals, ax=axes[0,2], alpha=0.4).set_title('Homoscedasticity', fontsize=12, weight='bold')
axes[0,2].set_ylabel('Residuals')
sns.lineplot(x=y_test, y=[0]*len(y_test), ax=axes[0,2], color='r');
#Correlation between independent variables and residuals
sns.scatterplot(x=X_test['Aces'], y=residuals, ax=axes[1,0], alpha=0.4).set_title('Relationship between \nAces and Residuals (Winnings)', fontsize=12, weight='bold')
sns.lineplot(x=X_test['Aces'], y=[0]*len(X_test['Aces']), ax=axes[1,0], color='r');
sns.scatterplot(x=X_test['DoubleFaults'], y=residuals, ax=axes[1,1], alpha=0.4).set_title('Relationship between \nDoubleFaults and Residuals (Winnings)', fontsize=12, weight='bold')
sns.lineplot(x=X_test['DoubleFaults'], y=[0]*len(X_test['DoubleFaults']), ax=axes[1,1], color='r');
#Autocorrelation in residuals
sns.lineplot(x=residuals.index, y=residuals, ax=axes[1,2]).set_title('Autocorrelation of Residuals', fontsize=12, weight='bold');
axes[1,2].set_ylabel('Residuals')
subplts = [axes[0,1], axes[0,2], axes[1,0], axes[1,1], axes[1,2]]
colnames = X_test.columns
for i in range(len(subplts)):
    subplts[i].set_yticks([-400000, -200000, 0, 200000, 400000],['-400K', '-200K', '0', '200K', '400K'])
#Diagnose
X_model = sm.add_constant(X_train)
sm_model = sm.OLS(y_train, X_model).fit()
sm_model.summary()
Dep. Variable: | Winnings | R-squared: | 0.823 |
---|---|---|---|
Model: | OLS | Adj. R-squared: | 0.821 |
Method: | Least Squares | F-statistic: | 623.4 |
Date: | Tue, 11 Oct 2022 | Prob (F-statistic): | 1.01e-101 |
Time: | 18:07:25 | Log-Likelihood: | -3319.1 |
No. Observations: | 272 | AIC: | 6644. |
Df Residuals: | 269 | BIC: | 6655. |
Df Model: | 2 | ||
Covariance Type: | nonrobust |
coef | std err | t | P>|t| | [0.025 | 0.975] | |
---|---|---|---|---|---|---|
const | 2.669e+04 | 3551.116 | 7.515 | 0.000 | 1.97e+04 | 3.37e+04 |
Aces | 492.2228 | 122.894 | 4.005 | 0.000 | 250.267 | 734.179 |
DoubleFaults | 3421.4952 | 291.065 | 11.755 | 0.000 | 2848.440 | 3994.551 |
Omnibus: | 18.608 | Durbin-Watson: | 1.989 |
---|---|---|---|
Prob(Omnibus): | 0.000 | Jarque-Bera (JB): | 51.668 |
Skew: | 0.188 | Prob(JB): | 6.03e-12 |
Kurtosis: | 5.102 | Cond. No. | 80.7 |
fig = plt.figure(figsize=(5,5))
sns.regplot(x=y_test, y=y_pred, scatter_kws={'alpha':0.4, 's':15}).set_title('Actual v Predicted Winnings', fontsize=12, weight='bold')
plt.ylabel('Predicted Winnings', fontsize=10)
plt.xlabel('Actual Winnings', fontsize=10);
The distribution of the residuals improved greatly; however, we still have heteroscedasticity and other issues.
We could keep iterating on this model by removing data, increasing our score (R2 value), and trying to fix the other issues, but we would be removing more and more data just to make the model work better. With my current level of stringency, I have already removed 49 out of 438 observations (more than 10%!). What if this is as good as the model will get with this dataset? We can't just eliminate half of the dataset to get a perfect score. Alternatively, we could go back and delete only the outliers but not the highly influential points. There are many options we could explore, but this exercise is already long as it is.
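As a side note, the R2 reported in the statsmodels summary above is computed on the training split; if we wanted the equivalent figure on the held-out test data, scikit-learn's .score() method gives it directly (a quick sketch; output not reproduced here):
print(lm.score(X_test, y_test))  # R-squared of the final sklearn model on the held-out test split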
An R2 value of 0.82 can be considered very good given the constraints discussed above.
Additionally, it is important to remember that we have chosen (for the sake of the exercise) to fit a linear regression model. It is possible that other models might fit the data better, but exploring those options escapes the scope of this exercise.
So far, we can be happy with this model. What do you think?
Thanks for reading!