import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import time 
warnings.filterwarnings('ignore')
import shap
shap.initjs()
from scipy.stats import zscore

sns.set_theme(
    context='talk',
    font_scale=0.7,
    palette = ['#0F00F5', '#3061FF', '#9AB1FF', '#CDD9FF', '#E6ECFF','#E5E5E5',
               '#B6BBCB', '#878B98','#696A6F','#292A2E'],
    style = {
        'axes.facecolor': '#FFFFFF',
         'axes.edgecolor': '#000000',
         'axes.legend.edgecolor':'#FFFFFF',
         'axes.grid': False,
         'axes.axisbelow': 'line',
         'axes.labelcolor': 'black',
         'figure.facecolor': '#FFFFFF',
         'grid.color': '#b0b0b0',
         'grid.linestyle': '-',
         'text.color': 'black',
         'xtick.color': 'black',
         'ytick.color': 'black',
         'xtick.direction': 'out',
         'ytick.direction': 'out',
         'patch.edgecolor': '#FFFFFF',
         'patch.force_edgecolor': True,
         'image.cmap': 'viridis',
         'font.family': ['sans-serif'],
         'font.sans-serif': 'Helvetica Neue',
         'xtick.bottom': False,
         'xtick.top': False,
         'ytick.left': False,
         'ytick.right': False,
         'axes.spines.left': False,
         'axes.spines.bottom': False,
         'axes.spines.right': False,
         'axes.spines.top': False
    }
)


cars_df_full = pd.read_pickle('data/cars_data.pkl')
cars_df_full.head()


#drop some missing data and columns we won't use
cars_df = cars_df_full.dropna(how='any', subset=['price', 'year', 'manufacturer', 'model', 'odometer']).reset_index(drop=True).drop(columns=['lat', 'long', 'posting_date', 'county', 'size', 'paint_color', 'region'])

#discard cars w/o price
cars_df = cars_df[cars_df['price'] > 0]

#remove price outliers (top 10 and bottom 10 deciles)
cars_df['decile'] = pd.qcut(cars_df['price'], 10, labels=range(1,11))
cars_df = cars_df[cars_df['decile'].isin([1,10])==False]
cars_df = cars_df.drop(columns='decile')

#remove odometer 1% outliers
cars_df['odometer_centile'] = pd.qcut(cars_df['odometer'], 100, labels=range(1,101))
cars_df = cars_df[cars_df['odometer_centile'] <= 99].drop(columns='odometer_centile')

#remove models that appear less than 5 times
counts_df = cars_df['model'].value_counts().reset_index().loc[:,['index', 'model']]
model_counts_dict = dict(zip(counts_df['index'], counts_df['model']))
cars_df['model_count'] = cars_df['model'].map(model_counts_dict)
cars_df = cars_df[cars_df['model_count'] >= 5]


#Check % of missingness in remaining df
round(cars_df.isna().sum() * 100 / len(cars_df),2)

price            0.00
year             0.00
manufacturer     0.00
model            0.00
condition       37.08
cylinders       41.70
fuel             0.31
odometer         0.00
title_status     1.32
transmission     0.42
drive           30.06
type            19.74
state            0.00
model_count      0.00
dtype: float64


cylinders_df = cars_df[['model', 'cylinders']]
cylinders_df.dropna(inplace=True)
cylinders_df.drop_duplicates()
cylinders_dict = dict(zip(cylinders_df['model'], cylinders_df['cylinders']))
cars_df['cylinders'] = cars_df['model'].map(cylinders_dict)
cars_df['cylinders'] = cars_df['cylinders'].str.extract('(\d{1,2})')
cars_df['cylinders'] = pd.to_numeric(cars_df['cylinders'], downcast='signed')

drive_df = cars_df[['model', 'drive']]
drive_df.dropna(inplace=True)
drive_df.drop_duplicates()
drive_dict = dict(zip(drive_df['model'], drive_df['drive']))
cars_df['drive'] = cars_df['model'].map(drive_dict)

type_df = cars_df[['model', 'type']]
type_df.dropna(inplace=True)
type_df.drop_duplicates()
type_dict = dict(zip(type_df['model'], type_df['type']))
cars_df['type'] = cars_df['model'].map(type_dict)

fuel_dict = dict(zip(cars_df.groupby('fuel').mean().reset_index()['fuel'].tolist(), list(reversed(range(5)))))
cars_df['fuel'] = cars_df['fuel'].map(fuel_dict)

cars_df['title_status'] = cars_df['title_status'].fillna('missing')

title_dict = {
    'clean': 5,
    'lien': 4,
    'rebuilt': 3,
    'salvage': 2,
    'missing': 1,
    'parts only': 0 
}

cars_df['title_status'] = cars_df['title_status'].map(title_dict)


condition_dict = {
    'new': 5,
    'like new': 4,
    'excellent': 3,
    'good': 2,
    'fair': 1,
    'salvage': 0
}
cars_df['condition'] = cars_df['condition'].map(condition_dict)
cars_df['condition'] = cars_df['condition'].fillna(0)


#Check missingness after imputation
round(cars_df.isna().sum() * 100 / len(cars_df),2)

price           0.00
year            0.00
manufacturer    0.00
model           0.00
condition       0.00
cylinders       8.33
fuel            0.31
odometer        0.00
title_status    0.00
transmission    0.42
drive           3.87
type            0.72
state           0.00
model_count     0.00
dtype: float64


#drop duplicates across the board
cars_df.drop_duplicates(inplace=True)

cars_df.reset_index(drop=True, inplace=True)

cars_df.head()


fig, axes = plt.subplots(3,5, figsize=(20,15))
sample_data = cars_df.sample(frac=.1, random_state=0)

#replace these vars with numbers to make cleaner plots and avoid overlapping labels. We just want to see distributions after all
for var in ['manufacturer', 'model', 'type', 'state']: 
    sample_data[var] = sample_data[var].map(dict(zip(sample_data[var].unique(), range(len(sample_data[var].unique())))))

for col, ax in zip(cars_df.columns, axes.ravel()):
    sns.histplot(data=sample_data[col], ax=ax).set_title(col, size=15)
    ax.set_xlabel('')
    ax.set_ylabel('')

plt.subplots_adjust(hspace=.5, wspace=.3)


#Scatterplots
corr_vars = ['price', 'year', 'condition', 'cylinders', 'odometer', 'title_status']

vars_a, vars_b = [], []

for i in corr_vars:
    for j in corr_vars:
        vars_a.append(i)
        vars_b.append(j)

vars_df = pd.DataFrame({'a': vars_a, 'b': vars_b})

fig, axes = plt.subplots(len(corr_vars),len(corr_vars), figsize=(15,15))
for i, ax in zip(range(len(vars_df)), axes.ravel()):
    x = vars_df.iloc[i]['a']
    y = vars_df.iloc[i]['b']
    sns.scatterplot(data=sample_data, x=x, y=y, ax=ax, s=2, alpha=.1).set_title(f'{x}\nvs\n{y}', fontsize=10)
    ax.set_xlabel('')
    ax.set_ylabel('')
    ax.set_xticklabels('')
    ax.set_yticklabels('')
    ax.legend([],[], frameon=False)

plt.subplots_adjust(hspace=1.2, wspace=.5)


#Heatmaps for Pearson and Spearman coefficients
cars_corr = cars_df.loc[:,corr_vars].corr() 
cars_corr2 = cars_df.loc[:,corr_vars].corr(method='spearman') 
corr_labels = round(cars_corr, 2)
corr_labels2 = round(cars_corr2, 2)

fig, axes = plt.subplots(1,2,figsize=(18,8))
sns.heatmap(cars_corr, annot=corr_labels, cmap='vlag', center=0, ax=axes[0], cbar=False).set_title('Pearson', size=15)
sns.heatmap(cars_corr2, annot=corr_labels2, cmap='vlag', center=0, ax=axes[1], cbar=False).set_title('Spearman', size=15)

plt.subplots_adjust(wspace=.3)


from sklearn.model_selection import train_test_split

#Work with sample
cars_df = cars_df.sample(frac=.1, random_state=0).reset_index(drop=True)

#Split data
X = cars_df.drop(columns=['price', 'manufacturer', 'model', 'transmission', 'drive', 'type', 'state', 'model_count'])
y = cars_df['price']

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=.7, random_state=0)


# Load the model from the file
import joblib
xgb_model = joblib.load('best_model_final.pkl')

# Predict with the  model
y_pred = xgb_model.predict(X_test)


from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2= r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

print(f'MSE:\t{mse:.1f}\nR2:\t{r2:.3f}\nMAE:\t{mae:.1f}')

MSE:	23629481.2
R2:	0.741
MAE:	3639.6


fig, axes = plt.subplots(1,1,figsize=(7,5))

sns.scatterplot(x=y_test, y=y_pred, alpha=.1, s=10, ax=axes).set_title('Real vs Predicted Prices (Test Set)', size=15)

plt.xlabel('Real Prices (y_test)')
plt.ylabel('Predicted Prices (y_pred)');


price_diff = y_test - y_pred

price_errors = pd.DataFrame({
    'Real Price': y_test,
    'Predicted Price': y_pred,
    'Price Difference': price_diff,
    'Absolute Difference': abs(price_diff)
})

fig, axes = plt.subplots(1,2,figsize=(17,5))

sns.histplot(data=price_errors, x='Price Difference', bins=100, ax=axes[0]).set_title('Distribution of Prediction Errors (Test Set)', size=15);
sns.histplot(data=price_errors, x='Absolute Difference', bins=100, ax=axes[1]).set_title('Distribution of Prediction Absolute Errors (Test Set)', size=15);


outlier_index = price_errors[price_errors['Absolute Difference'] == price_errors['Absolute Difference'].max()].index[0]

price_errors[price_errors['Absolute Difference'] == price_errors['Absolute Difference'].max()]


price_errors['Error Bin'] = [
    '-20k+' if i <= -20000 else 
    '-20k to -10k' if i > -20000 and i <= -10000 else
    '-10k to -5k' if i > -10000 and i <= -5000 else
    '-5k to -1k' if i > -5000 and i <= -1000 else
    '-1k to -500' if i > -1000 and i <= -500 else
    '-500 to -100' if i > -500 and i <= -100 else
    '-100 to 0' if i > -500 and i <= 0 else
    '0 to 100' if i > 0 and i <= 100 else
    '100 to 500' if i > 100 and i <= 500 else
    '500 to 1k' if i > 500 and i <= 1000 else
    '1k to 5k' if i > 1000 and i <= 5000 else
    '5k to 10k' if i > 5000 and i <= 10000 else
    '10k to 20k' if i > 10000 and i <= 20000 else
    '20k+'
    for i in price_errors['Price Difference']
]

price_errors_grouped = price_errors.groupby('Error Bin').count().reset_index().iloc[:,:2]
price_errors_grouped.columns = ['Error Bin', 'Num Errors']
price_errors_grouped['Percentage'] = price_errors_grouped['Num Errors'] * 100 / price_errors_grouped['Num Errors'].sum()

fig, axes = plt.subplots(1,1,figsize=(7,5))

error_bin_order = price_errors.sort_values(by='Price Difference')['Error Bin'].unique().tolist()

sns.barplot(
    data=price_errors_grouped,
    x='Percentage',
    y='Error Bin',
    ax=axes,
    order=error_bin_order,
    color=sns.color_palette()[0]
).set_title('Frequency and Size of Errors (Test Set)', size=15);

for container in axes.containers:
    values = container.datavalues
    bar_labels = [price_errors_grouped[price_errors_grouped['Percentage']==value]['Percentage'].tolist()[0] for value in values]
    axes.bar_label(container, 
                  labels = [f'{i:.2f}%' for i in bar_labels],
                  padding=5,
                  size=11)

plt.xlim(0,45)

xlabels = axes.get_xticks()
axes.set_xticks(ticks=xlabels, labels=[f'{i:.0f}%' if i%20==0 else '' for i in xlabels]);

plt.ylabel('Prediction Error');
plt.xlabel('Frequency');


### Calculations
#MDI
mdi_importances = pd.DataFrame({
    'Feature': xgb_model.feature_names_in_, 
    'Importance': xgb_model.feature_importances_
}).sort_values(by='Importance', ascending=False).reset_index(drop=True)

# Permutation
from sklearn.inspection import permutation_importance

perm_importances = permutation_importance(
    xgb_model, X_test, y_test, n_repeats=10, random_state=0, n_jobs=-1
)
sorted_importances_idx = perm_importances.importances_mean.argsort()

permutation_importances = pd.DataFrame(
    perm_importances.importances[sorted_importances_idx].T,
    columns=X.columns[sorted_importances_idx],
)

#Shapley
# These three lines that are required to create this and some other plots later take too long to run on this model and dataset, so I'll simply load the saved results from file
explainer = shap.TreeExplainer(xgb_model) 
shap_values = explainer.shap_values(X)
explainer_obj = explainer(X)

#Predictive Power Score (PPS)
import ppscore as pps 

pps_df = X_test.copy()
pps_df['price'] = y_test
pps_scores = pps.predictors(pps_df, 'price')


### Show all plots
plt.subplots(2,2,figsize=(10,5))

#MDI
plt.subplot(2,2,1)
sns.barplot(data=mdi_importances, x='Importance', y='Feature', color=sns.color_palette()[0]);
plt.xlabel('MDI Importance')

#Permutation
plt.subplot(2,2,2)
sns.boxplot(data=pd.melt(permutation_importances, var_name='Feature', value_name='Importance').sort_values(by='Importance', ascending=False), 
            color=sns.color_palette()[0],
            x='Importance', 
            y='Feature', 
            orient='h',
            linewidth=.5)
plt.axvline(x=0, color="k", linestyle="--", linewidth=.8)
plt.xlabel('Permutation Importance')

#Shapley
plt.subplot(2,2,3)
shap.summary_plot(shap_values, X, plot_type="bar", show=False, plot_size=None, color=sns.color_palette()[0])
plt.ylabel('Feature');
plt.xlabel('Mean SHAP value')

#PPS
plt.subplot(2,2,4)
sns.barplot(data=pps_scores, y="x", x="ppscore", color=sns.color_palette()[0]);
plt.ylabel('Feature');
plt.xlabel('PPS')

plt.show()


from xgboost import plot_tree

last_tree_index = xgb_model.get_booster().num_boosted_rounds() - 1

fig, axes = plt.subplots(1,1,figsize=(20, 100))
plot_tree(xgb_model, num_trees=last_tree_index, rankdir='LR', ax=axes);


from sklearn.inspection import PartialDependenceDisplay

top_features = ['year', 'odometer', 'cylinders']

top_feature_indices = [np.argwhere(X_test.columns.str.contains(feature))[0][0] for feature in top_features]

fig, axes = plt.subplots(1,1,figsize=(15,5))

PartialDependenceDisplay.from_estimator(
    xgb_model, 
    X_test.sample(n=100, random_state=0), # note that I'm only plotting a smaller sample to make the plots more readable
    top_feature_indices, 
    feature_names=X_test.columns, 
    ax=axes,
    kind='both',  # 'both' to display both PDP and ICE
    ice_lines_kw={'color': 'blue', 'alpha': 0.2},  # ICE plot customization
    pd_line_kw={'color': 'red', 'linestyle': '-', 'linewidth':1, 'label': 'PDP'}  # PDP line customization
);


fig, axes = plt.subplots(1,3,figsize = (15,5))

PartialDependenceDisplay.from_estimator(
    xgb_model, 
    X_test.sample(n=100, random_state=0), # note that I'm only plotting a smaller sample to make the plots more readable
    **{'features': ['odometer', 'year', ('odometer', 'year')], 'kind':'average'},
    ax=axes,
    pd_line_kw={'color': 'red', 'linestyle': '-', 'linewidth':1, 'label': 'PDP'},  # PDP line customization,
);


shap.summary_plot(shap_values, X, cmap='viridis')


selected_features = ['year', 'odometer', 'cylinders']

for feature in selected_features:
    shap.dependence_plot(feature, shap_values, X, cmap='viridis', alpha=.5)


car_indices = [2,8,outlier_index]
cars_df.iloc[car_indices]


shap.force_plot(explainer.expected_value, shap_values[car_indices[0], :], X.iloc[car_indices[0], :])


shap.force_plot(explainer.expected_value, shap_values[car_indices[1], :], X.iloc[car_indices[1], :])


shap.force_plot(explainer.expected_value, shap_values[car_indices[2], :], X.iloc[car_indices[2], :])


for i in car_indices:
    print(pd.DataFrame(cars_df.iloc[i]).T)
    shap.plots.waterfall(explainer_obj[i], max_display=8)

  price    year manufacturer      model condition cylinders fuel  odometer  \
2  8995  2008.0      lincoln  navigator       0.0       8.0  2.0  147887.0   

  title_status transmission drive type state model_count  
2            5    automatic   4wd  SUV    mi         236

   price    year manufacturer     model condition cylinders fuel  odometer  \
8  16300  2016.0    chevrolet  traverse       0.0       6.0  2.0  112657.0   

  title_status transmission drive type state model_count  
8            5    automatic   4wd  SUV    mi         801

     price    year manufacturer      model condition cylinders fuel odometer  \
3426  5999  2017.0    chevrolet  silverado       4.0       8.0  2.0  99999.0   

     title_status transmission drive    type state model_count  
3426            5    automatic   4wd  pickup    fl        2011


outlier_model = 'silverado'
silverados_sample = pd.concat([
    cars_df[cars_df['model']==outlier_model].sample(n=10, random_state=0), # Ten random Silverados
    pd.DataFrame(cars_df.iloc[outlier_index,:]).T # our outlier, at the bottom of the df
    ])
silverados_sample


car_indices2 = silverados_sample.index.tolist()

r = shap.decision_plot(explainer.expected_value, shap_values[car_indices2], X.iloc[car_indices2], highlight=car_indices2.index(outlier_index), return_objects=True)


selected_sample = outlier_index
shap.decision_plot(
    explainer.expected_value,
    shap_values[selected_sample],
    X.iloc[selected_sample],
    feature_order=r.feature_idx,
    xlim=r.xlim,
)


pd.DataFrame(cars_df.iloc[outlier_index,:]).T


sample_idx = X.sample(n=1000, random_state=0).index.tolist()

shap.plots.heatmap(
    explainer(X.loc[sample_idx]), 
    instance_order=explainer(X.loc[sample_idx]).sum(1)
    );

	region	price	year	manufacturer	model	condition	cylinders	fuel	odometer	title_status	transmission	drive	size	type	paint_color	county	state	lat	long	posting_date
0	prescott	6000	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	az	NaN	NaN	NaN
1	fayetteville	11900	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	ar	NaN	NaN	NaN
2	florida keys	21000	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	fl	NaN	NaN	NaN
3	worcester / central MA	1500	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	ma	NaN	NaN	NaN
4	greensboro	4900	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	nc	NaN	NaN	NaN

An Explainable XGBoost Approach to Predicting Car Prices¶

Overview¶

Quick EDA + Preprocessing¶

Import Libraries and Data¶

Initial Clean-up¶

Imputation + Basic Encoding¶

Check Distributions¶

Check Correlations¶

Import the Model and Predict With It¶

Evaluate the Model and Interpret the Outputs¶

Basic Performance Metrics¶

Global Feature Importance¶

Plot the tree!¶

Partial Dependence and Individual Conditional Expectation Plots¶

Local Feature Importance¶

Conclusion and Future Directions¶

	price	year	manufacturer	model	condition	cylinders	fuel	odometer	title_status	transmission	drive	type	state	model_count
0	33590	2014.0	gmc	sierra 1500 crew cab slt	2.0	8.0	2.0	57923.0	5	other	4wd	pickup	al	185
1	22590	2010.0	chevrolet	silverado 1500	2.0	8.0	2.0	71229.0	5	other	4wd	pickup	al	3574
2	30990	2017.0	toyota	tundra double cab sr	2.0	8.0	2.0	41124.0	5	other	4wd	pickup	al	328
3	15000	2013.0	ford	f-150 xlt	3.0	8.0	2.0	128000.0	5	automatic	4wd	pickup	al	325
4	27990	2012.0	gmc	sierra 2500 hd extended cab	2.0	8.0	2.0	68696.0	5	other	4wd	pickup	al	165

	price	year	manufacturer	model	condition	cylinders	fuel	odometer	title_status	transmission	drive	type	state	model_count
2	8995	2008.0	lincoln	navigator	0.0	8.0	2.0	147887.0	5	automatic	4wd	SUV	mi	236
8	16300	2016.0	chevrolet	traverse	0.0	6.0	2.0	112657.0	5	automatic	4wd	SUV	mi	801
3426	5999	2017.0	chevrolet	silverado	4.0	8.0	2.0	99999.0	5	automatic	4wd	pickup	fl	2011

	price	year	manufacturer	model	condition	cylinders	fuel	odometer	title_status	transmission	drive	type	state	model_count
16500	11000	2007.0	chevrolet	silverado	2.0	8.0	2.0	205000.0	5	automatic	4wd	pickup	il	2011
5828	13998	2011.0	chevrolet	silverado	0.0	8.0	2.0	150853.0	5	automatic	4wd	pickup	id	2011
13157	5300	1998.0	chevrolet	silverado	0.0	8.0	2.0	190000.0	5	automatic	4wd	pickup	ma	2011
6136	7000	2002.0	chevrolet	silverado	0.0	8.0	2.0	125000.0	5	automatic	4wd	pickup	pa	2011
10341	35999	2017.0	chevrolet	silverado	0.0	8.0	2.0	47815.0	5	automatic	4wd	pickup	tx	2011
5594	8500	1994.0	chevrolet	silverado	3.0	8.0	2.0	142262.0	5	automatic	4wd	pickup	tx	2011
939	31956	2017.0	chevrolet	silverado	3.0	8.0	2.0	91059.0	5	automatic	4wd	pickup	ky	2011
528	24995	2017.0	chevrolet	silverado	0.0	8.0	2.0	64250.0	5	automatic	4wd	pickup	dc	2011
8834	6111	2007.0	chevrolet	silverado	0.0	8.0	2.0	234336.0	5	automatic	4wd	pickup	fl	2011
14343	26500	1984.0	chevrolet	silverado	0.0	8.0	2.0	38751.0	5	automatic	4wd	pickup	sd	2011
3426	5999	2017.0	chevrolet	silverado	4.0	8.0	2.0	99999.0	5	automatic	4wd	pickup	fl	2011