Principal Component Analysis for the Non-Technical Stakeholder

Introduction and Selection of Data

Principal Component Analysis (PCA) remains one of the most powerful yet poorly understood techniques in Data Science. As a data scientist myself, I struggled with it when I first learnt it, primarily because the people I learnt from skipped critical explanations and the answers to questions that naturally arise in the curious mind of someone who simply wants to know what the numbers, arrows and plots convey. It took me some time and maturity as a data scientist to fully dive into and understand PCA, so I decided to create this project to share what I learnt and cover in good detail the most important moving parts of PCA.

PCA is a dimensionality reduction technique, meaning that its purpose is to reduce the number of dimensions in the data while preserving as much of the information contained in it as possible. The problem arises when we want to interpret what we are left with after we have implemented PCA, as the original dataset and the transformed one are not the same. In this context, dimensionality reduction is not simply dropping dimensions (i.e. columns) from the dataset -- it involves transforming the data into a new coordinate system. The Wikipedia article on PCA thus states that "the data is linearly transformed onto a new coordinate system such that the directions (principal components) capturing the largest variation in the data can be easily identified."
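If the idea of a "new coordinate system" sounds abstract, the following minimal, self-contained sketch (a toy example with made-up data, not the car dataset we will use below) shows PCA rotating two correlated features so that the first new axis points in the direction of greatest spread:

import numpy as np
from sklearn.decomposition import PCA

# Two correlated toy features: the second is roughly half of the first plus noise
rng = np.random.default_rng(42)
x = rng.normal(size=200)
toy = np.column_stack([x, 0.5 * x + rng.normal(scale=0.3, size=200)])

# PCA rotates the data onto new axes (the principal components)
pca_toy = PCA(n_components=2)
toy_rotated = pca_toy.fit_transform(toy)

# Most of the variance now sits on the first new axis (PC1)
print(pca_toy.explained_variance_ratio_)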

I am not going to cover the mathematical intuition behind PCA -- if you are new to the topic, I strongly suggest you first check out the StatQuest video on PCA. It is a goldmine of information. Rather, I will focus on making sense of the outputs and bringing them back to lay terms with more in-context interpretations. I believe the math makes more sense after things are explained at a non-technical level.

For this exercise (and because I love cars) I will use the Automobile dataset from the UC Irvine ML Repository. The features in the dataset are as follows:

symboling: Risk factor assigned to an automobile (-3 is safest, +3 is riskiest).

normalized_losses: Adjusted average loss per insured vehicle over a year.

make: Manufacturer of the vehicle (e.g., Toyota, BMW).

fuel_type: Type of fuel (gas or diesel).

aspiration: Type of aspiration (standard or turbocharged).

num_doors: Number of doors (two or four).

body_style: Style of the body (e.g., sedan, convertible).

drive_wheels: Type of drive (e.g., front-wheel drive, rear-wheel drive).

engine_location: Location of the engine (front or rear).

wheel_base: Distance between the front and rear wheels.

length: Length of the car.

width: Width of the car.

height: Height of the car.

curb_weight: Total weight of the car without passengers or cargo.

engine_size: Displacement of the engine in cubic inches.

bore: Diameter of the cylinders in the engine.

stroke: Length of the piston travel inside the cylinder.

compression_ratio: Ratio of cylinder volume before and after compression.

horsepower: Power output of the engine.

peak_rpm: Maximum revolutions per minute of the engine.

city_mpg: Miles per gallon in city driving.

highway_mpg: Miles per gallon in highway driving.

price: Price of the vehicle.

Also, the packages and versions used are: pandas 1.4.4, seaborn 0.11.2, scikit-learn (sklearn) 1.2.0, matplotlib 3.5.3, numpy 1.26.4.

Let us start with quickly looking at the data:

In [114]:
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

sns.set_theme(
    context='talk',
    font_scale=0.9,
    palette = ['#0F00F5', '#3061FF', '#9AB1FF', '#CDD9FF', '#E6ECFF','#E5E5E5',
               '#B6BBCB', '#878B98','#696A6F','#292A2E'],
    style = {
        'axes.facecolor': '#FFFFFF',
         'axes.edgecolor': '#000000',
         'legend.edgecolor': '#FFFFFF',
         'axes.grid': False,
         'axes.axisbelow': 'line',
         'axes.labelcolor': 'black',
         'figure.facecolor': '#FFFFFF',
         'grid.color': '#b0b0b0',
         'grid.linestyle': '-',
         'text.color': 'black',
         'xtick.color': 'black',
         'ytick.color': 'black',
         'xtick.direction': 'out',
         'ytick.direction': 'out',
         'patch.edgecolor': '#FFFFFF',
         'patch.force_edgecolor': True,
         'image.cmap': 'viridis',
         'font.family': ['sans-serif'],
         'font.sans-serif': 'Helvetica Neue',
         'xtick.bottom': False,
         'xtick.top': False,
         'ytick.left': False,
         'ytick.right': False,
         'axes.spines.left': False,
         'axes.spines.bottom': False,
         'axes.spines.right': False,
         'axes.spines.top': False
    }
)
In [9]:
columns = [
    "symboling", "normalized_losses", "make", "fuel_type", "aspiration",
    "num_doors", "body_style", "drive_wheels", "engine_location", "wheel_base",
    "length", "width", "height", "curb_weight", "engine_type", "num_cylinders",
    "engine_size", "fuel_system", "bore", "stroke", "compression_ratio",
    "horsepower", "peak_rpm", "city_mpg", "highway_mpg", "price"
]
df = pd.read_csv('imports-85.data', names=columns, na_values=np.nan)
df
Out[9]:
symboling normalized_losses make fuel_type aspiration num_doors body_style drive_wheels engine_location wheel_base ... engine_size fuel_system bore stroke compression_ratio horsepower peak_rpm city_mpg highway_mpg price
0 3 ? alfa-romero gas std two convertible rwd front 88.6 ... 130 mpfi 3.47 2.68 9.0 111 5000 21 27 13495
1 3 ? alfa-romero gas std two convertible rwd front 88.6 ... 130 mpfi 3.47 2.68 9.0 111 5000 21 27 16500
2 1 ? alfa-romero gas std two hatchback rwd front 94.5 ... 152 mpfi 2.68 3.47 9.0 154 5000 19 26 16500
3 2 164 audi gas std four sedan fwd front 99.8 ... 109 mpfi 3.19 3.40 10.0 102 5500 24 30 13950
4 2 164 audi gas std four sedan 4wd front 99.4 ... 136 mpfi 3.19 3.40 8.0 115 5500 18 22 17450
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
200 -1 95 volvo gas std four sedan rwd front 109.1 ... 141 mpfi 3.78 3.15 9.5 114 5400 23 28 16845
201 -1 95 volvo gas turbo four sedan rwd front 109.1 ... 141 mpfi 3.78 3.15 8.7 160 5300 19 25 19045
202 -1 95 volvo gas std four sedan rwd front 109.1 ... 173 mpfi 3.58 2.87 8.8 134 5500 18 23 21485
203 -1 95 volvo diesel turbo four sedan rwd front 109.1 ... 145 idi 3.01 3.40 23.0 106 4800 26 27 22470
204 -1 95 volvo gas turbo four sedan rwd front 109.1 ... 141 mpfi 3.78 3.15 9.5 114 5400 19 25 22625

205 rows × 26 columns

However, for PCA we will only use the numerical features. We could encode the string variables into numerical categories, but there is enough information in the numerical variables for this exercise to save ourselves some effort. Missing values will simply be imputed with the mean of each column for this exercise.

In [10]:
num_cols = [
    'normalized_losses', 'wheel_base', 'length', 'width', 'height', 'curb_weight', 'engine_size', 'bore', 'stroke', 'compression_ratio', 'horsepower', 'peak_rpm', 
    'city_mpg', 'highway_mpg', 'price'
]

df_nums = df[num_cols].apply(pd.to_numeric, errors='coerce')

for col in num_cols:
    df_nums[col] = df_nums[col].fillna(df_nums[col].mean())

df_nums
Out[10]:
normalized_losses wheel_base length width height curb_weight engine_size bore stroke compression_ratio horsepower peak_rpm city_mpg highway_mpg price
0 122.0 88.6 168.8 64.1 48.8 2548 130 3.47 2.68 9.0 111.0 5000.0 21 27 13495.0
1 122.0 88.6 168.8 64.1 48.8 2548 130 3.47 2.68 9.0 111.0 5000.0 21 27 16500.0
2 122.0 94.5 171.2 65.5 52.4 2823 152 2.68 3.47 9.0 154.0 5000.0 19 26 16500.0
3 164.0 99.8 176.6 66.2 54.3 2337 109 3.19 3.40 10.0 102.0 5500.0 24 30 13950.0
4 164.0 99.4 176.6 66.4 54.3 2824 136 3.19 3.40 8.0 115.0 5500.0 18 22 17450.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
200 95.0 109.1 188.8 68.9 55.5 2952 141 3.78 3.15 9.5 114.0 5400.0 23 28 16845.0
201 95.0 109.1 188.8 68.8 55.5 3049 141 3.78 3.15 8.7 160.0 5300.0 19 25 19045.0
202 95.0 109.1 188.8 68.9 55.5 3012 173 3.58 2.87 8.8 134.0 5500.0 18 23 21485.0
203 95.0 109.1 188.8 68.9 55.5 3217 145 3.01 3.40 23.0 106.0 4800.0 26 27 22470.0
204 95.0 109.1 188.8 68.9 55.5 3062 141 3.78 3.15 9.5 114.0 5400.0 19 25 22625.0

205 rows × 15 columns

Applying PCA and Interpreting the Transformed Data

To apply PCA, the selected dataset needs to be standardised so that features with different scales do not affect the result differently -- you cannot compare features like city_mpg (range: 13 - 49) to peak_rpm (range: 4150 - 6600) directly. This will be done with scikit-learn's StandardScaler(), which rescales each feature to a mean of 0 and unit variance (note that this standardises the data; it does not make it Gaussian).

In [11]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(df_nums)

pd.DataFrame(X_scaled, columns=df_nums.columns)
Out[11]:
normalized_losses wheel_base length width height curb_weight engine_size bore stroke compression_ratio horsepower peak_rpm city_mpg highway_mpg price
0 0.000000 -1.690772 -0.426521 -0.844782 -2.020417 -0.014566 0.074449 0.519089 -1.839404 -0.288349 0.171065 -0.263484 -0.646553 -0.546059 0.036674
1 0.000000 -1.690772 -0.426521 -0.844782 -2.020417 -0.014566 0.074449 0.519089 -1.839404 -0.288349 0.171065 -0.263484 -0.646553 -0.546059 0.419498
2 0.000000 -0.708596 -0.231513 -0.190566 -0.543527 0.514882 0.604046 -2.404862 0.685920 -0.288349 1.261807 -0.263484 -0.953012 -0.691627 0.419498
3 1.328961 0.173698 0.207256 0.136542 0.235942 -0.420797 -0.431076 -0.517248 0.462157 -0.035973 -0.057230 0.787346 -0.186865 -0.109354 0.094639
4 1.328961 0.107110 0.207256 0.230001 0.235942 0.516807 0.218885 -0.517248 0.462157 -0.540725 0.272529 0.787346 -1.106241 -1.273900 0.540524
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
200 -0.854332 1.721873 1.198549 1.398245 0.728239 0.763241 0.339248 1.666463 -0.336996 -0.162161 0.247163 0.577180 -0.340094 -0.400490 0.463449
201 -0.854332 1.721873 1.198549 1.351515 0.728239 0.949992 0.339248 1.666463 -0.336996 -0.364062 1.414003 0.367014 -0.953012 -0.837195 0.743720
202 -0.854332 1.721873 1.198549 1.398245 0.728239 0.878757 1.109571 0.926222 -1.232047 -0.338824 0.754485 0.787346 -1.106241 -1.128332 1.054566
203 -0.854332 1.721873 1.198549 1.398245 0.728239 1.273437 0.435538 -1.183465 0.462157 3.244916 0.044234 -0.683816 0.119594 -0.546059 1.180051
204 -0.854332 1.721873 1.198549 1.398245 0.728239 0.975021 0.339248 1.666463 -0.336996 -0.162161 0.247163 0.577180 -0.953012 -0.837195 1.199797

205 rows × 15 columns
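If you want to double-check what StandardScaler did, every column of X_scaled should now have a mean of (practically) 0 and a standard deviation of 1. A quick check:

# Each scaled feature should have mean ~0 and standard deviation ~1
print(np.allclose(X_scaled.mean(axis=0), 0))  # True
print(np.allclose(X_scaled.std(axis=0), 1))   # True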

Now we apply PCA to the scaled dataset. Note that the number of principal components (n_components) is not fixed -- the user defines it depending on how much information we want to retain. It can be set iteratively at first (e.g. 4) and then refined by looking at what is known as the scree plot, which we will see in just a second:

In [12]:
# Apply PCA
pca = PCA(n_components=6)
X_pca = pca.fit_transform(X_scaled)
In [13]:
# Variance explained by each component
explained_variance = pca.explained_variance_ratio_

explained_variance_df = pd.DataFrame({
    'Principal Component': [f'PC{i+1}' for i in range(pca.n_components_)],
    'Explained Variance': explained_variance,
    'Cum Variance': pd.Series(explained_variance).cumsum()
})

fig, axes = plt.subplots(1,2,figsize=(14,5))

sns.barplot(
    data=explained_variance_df,
    x='Principal Component',
    y='Explained Variance',
    ax=axes[0]
).set_title('Scree Plot:\nExplained Variance for Each PC\n\n')

sns.lineplot(
    data=explained_variance_df,
    x='Principal Component',
    y='Cum Variance',
    ax=axes[1],
    marker='o',
    size=3,
    markersize=5
).set_title('Cumulative Explained Variance\nwith Each New PC\n\n')

plt.legend([],[],frameon=False);

barlabels_1 = [f'{i*100:.1f}%' for i in explained_variance_df['Explained Variance']]
axes[0].bar_label(axes[0].containers[-1], label_type='edge', labels=barlabels_1, color='k', padding=5, rotation=0);


for x, y in zip(explained_variance_df['Principal Component'], explained_variance_df['Cum Variance']):
    if explained_variance_df['Principal Component'].tolist().index(x) == 0:
        axes[1].text(explained_variance_df['Principal Component'].tolist().index(x)+.1, y, f'{y*100:.1f}%')
    elif explained_variance_df['Principal Component'].tolist().index(x) <= 3:
        axes[1].text(explained_variance_df['Principal Component'].tolist().index(x), y-.025, f'{y*100:.1f}%')
    else:
        axes[1].text(explained_variance_df['Principal Component'].tolist().index(x)-.8, y, f'{y*100:.1f}%')


axes[0].set_xlabel('')
axes[1].set_xlabel('')
axes[0].set_ylabel('')
axes[1].set_ylabel('')

axes[0].set_yticklabels('');
axes[1].set_yticklabels('');

The scree plot shows how much variability is captured by each Principal Component (PC). PC1 captures ~49%, PC2 ~17%, and so on. Each additional PC captures less and less variability (until you capture it all, i.e. all the coefficients add up to 1), so the user needs to decide how many are enough to keep. There are a number of approaches to do this, but two common ones are the elbow test and the 95% rule -- retain the PCs that capture 95% of the accumulated variance. This is why calculating and plotting the cumulative explained variance (right plot) is useful when deciding. We can use 6 components, as PC6 already captures less than 4%, meaning we are entering the zone of diminishing returns for capturing information in the dataset.
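As an aside, if you want scikit-learn to apply the 95% rule for you, n_components also accepts a fraction between 0 and 1, in which case PCA keeps the smallest number of components whose cumulative explained variance reaches that threshold. A quick sketch (pca_95 is just an illustrative name, reusing X_scaled from above):

# Let PCA choose how many components are needed to retain 95% of the variance
pca_95 = PCA(n_components=0.95)
pca_95.fit(X_scaled)
print(pca_95.n_components_)                       # number of PCs retained
print(pca_95.explained_variance_ratio_.cumsum())  # cumulative variance of those PCs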

The features (columns) in the transformed dataset (X_pca), however, do not correspond to the features in the original, unscaled dataframe (df_nums), or even the scaled one (X_scaled), as shown below. The features in this dataset correspond to each PC we chose to keep, and each value corresponds to the PCA-transformed data for each sample on each PC. If you want to know how this happens mathematically, I strongly suggest you check out the StatQuest video I linked in the intro.

Nonetheless, since each row still corresponds to a car in the original dataset in an index-to-index fashion, we can leverage this to better explain the outputs of PCA in the later sections of this exercise.
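Concretely, the transformed array keeps one row per car but now has one column per retained principal component:

print(X_scaled.shape)  # (205, 15): 205 cars, 15 scaled original features
print(X_pca.shape)     # (205, 6): 205 cars, 6 principal components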

Eigenvalues, Eigenvectors, and Loading Scores

These are probably some of the most misunderstood terms in PCA.

Eigenvalues quantify the importance of each PC in capturing the variability of the data. The first eigenvalue (that of PC1) is always the largest, the second comes next, and so on. Eigenvalues are similar to the Explained Variance coefficients -- in fact they are proportional to them -- in the sense that higher values convey a higher proportion of the variability of the dataset contained in each PC; however, unlike the coefficients, they do not add up to 1 or to 100.

In [14]:
eigenvalues = pca.explained_variance_

pd.DataFrame({
    'Component': [f'PC{i+1}' for i in range(pca.n_components_)],
    'Eigenvalues': eigenvalues,
    'Explained Variance': explained_variance
})
Out[14]:
Component Eigenvalues Explained Variance
0 PC1 7.427482 0.492750
1 PC2 2.505051 0.166189
2 PC3 1.282823 0.085104
3 PC4 0.891585 0.059149
4 PC5 0.814024 0.054004
5 PC6 0.590178 0.039153
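If you want to convince yourself of the proportionality, dividing each eigenvalue by its explained variance coefficient returns (up to floating-point noise) the same constant for every PC:

# Eigenvalues and explained variance ratios differ only by a constant scaling factor
print(eigenvalues / explained_variance)  # roughly the same number (~15.07) for every PC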

Eigenvectors, on the other hand, are the singular vectors (unit-length vectors) for each PC, and they are made up of Loading Scores representing the influence of each original variable on the PC. The higher the absolute value of the loading score, the higher the influence of that variable on the PC, and the sign (positive or negative) shows whether the variable "pushes" or "pulls" the vector along the new coordinate axes relative to the origin. Your Eigenvectors, therefore, will have as many loading scores as you had features in the original dataframe. This is why Eigenvectors (or each PC) are referred to as linear combinations of the features in the original data (and therefore are not the original features themselves).

Eigenvectors are, in the end, n-dimensional unit vectors, where "n" represents the number of dimensions in the original dataset (in this case 15).
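Two quick checks make these statements concrete: each eigenvector in pca.components_ has length 1, and the transformed scores are nothing more than the (centred) scaled data weighted by those loading scores:

# Each eigenvector (row of pca.components_) is a unit-length vector in 15-dimensional space
print(np.allclose(np.linalg.norm(pca.components_, axis=1), 1))         # True

# Each PC score is a linear combination of the scaled features, weighted by the loading scores
print(np.allclose(X_pca, (X_scaled - pca.mean_) @ pca.components_.T))  # True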

Since we cannot plot (or interpret) a 15-dimension plot, it makes more intuitive sense to look at Eigenvectors and loading scores on a heatmap.

In [15]:
eigenvectors = pca.components_
loading_scores = pd.DataFrame(eigenvectors.T, columns=[f"PC{i+1}" for i in range(eigenvectors.shape[0])], index=num_cols)

fig, axes = plt.subplots(1,1,figsize=(8,10))
sns.heatmap(
    loading_scores, 
    annot=round(loading_scores,2), 
    center=0,
    cmap='vlag',
    ax=axes
).set_title('Heatmap of Loading Scores\n');

With the heatmap above we can now better interpret PCs and loading scores, and how PCA captures variability in the original dataset.

PC1, for example, is mildly positively influenced by "size" variables (e.g. length, curb_weight), "power" variables (horsepower, engine_size) and value (price), but also mildly negatively influenced by "fuel efficiency" (city_mpg, highway_mpg). These relationships make sense when we consider that bigger, more powerful (and more expensive) cars tend to be less fuel efficient, while fuel-efficient cars tend to be cheaper, with smaller engines packing less power. We can therefore infer that PC1 captures the size/power/efficiency trade-off dimension of the dataset.
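One way to back this reading with numbers is to rank the original features by the absolute size of their PC1 loading scores (a quick sketch reusing the loading_scores frame built above; pc1 is just an illustrative name):

# Original features ordered by how strongly (in absolute terms) they load on PC1
pc1 = loading_scores['PC1']
print(pc1.reindex(pc1.abs().sort_values(ascending=False).index))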

The same exercise can be done for all PCs, but that is an exercise for the reader. One thing is critical, however: "naming" and interpreting PCs appropriately in context requires deep domain knowledge and familiarity with the dataset at hand. After all, PCs are not variables, but relationships between variables that are sometimes not straightforward to conceptualise and bring into lay language.

PC Scatterplot and Biplot

The first thing most people do is go straight into plotting the PCs they obtained using a scatterplot. The shape of the scatterplot serves as a visual aid for making sense of how much variability was captured by each PC. In some extreme examples where PC1 captures 70%+ of the variability and PC2 20%+ (e.g. here), your scatterplot might look almost like a line, with a strong correlation between the two main PCs. In most cases, like in this example, the scatterplot will simply be a collection of points scattered around the two PCs you chose to plot (usually the first two). Each point corresponds to one sample in the original dataset, and since the correspondence is on an index-to-index basis, we can leverage this to extract further information from the scatterplot, but in the dimensionality-reduced space. The following plot adds fuel type for the original cars as additional information to the PC scatterplot, as well as the make for some "extreme" cars in the PC1 sense:

In [74]:
X_pca_df = pd.DataFrame(X_pca, columns=[f'PC{i+1}' for i in range(pca.n_components_)])
X_pca_df_joined = X_pca_df.join(df, how='left')

fig, axes = plt.subplots(1,1,figsize=(10,7))

pc_x, pc_y = (0,1)
to_plot = X_pca_df_joined
cols = X_pca_df_joined.columns

sns.scatterplot(data=to_plot, x=cols[pc_x], y=cols[pc_y], ax=axes, s=15, alpha=.5, hue='fuel_type', palette='bright').set_title(f'Scatterplot of PC{pc_x+1} and PC{pc_y+1}')
axes.grid(True, color='#F2F2F2', zorder=0.5, linewidth=.8)

plt.xlabel(f"PC{pc_x + 1} (~{pca.explained_variance_ratio_[pc_x]*100:.0f}% Variance)")
plt.ylabel(f"PC{pc_y + 1} (~{pca.explained_variance_ratio_[pc_y]*100:.0f}% Variance)")

for x, y, make in zip(X_pca_df_joined['PC1'], X_pca_df_joined['PC2'], X_pca_df_joined['make']):
    if abs(x) > 4:
        plt.text(x+.1, y+.1, make, size=10)

plt.legend(bbox_to_anchor=(1.1,1.1)).get_frame().set_linewidth(0)

The Biplot is an extension of the PC scatterplot. In addition to the overall distribution of the samples in the dimensionality-reduced space, it shows the strength of each original variable along the components we chose to visualise. As you might have guessed, the concept is similar to combining the heatmap above with the scatterplot, but instead of colors to show strength, we use arrow lengths, with the position of each arrow head corresponding to the (x, y) coordinates given by the loading scores of the relevant Eigenvector for the chosen PCs:

In [105]:
def biplot(scores, loadings, labels, components=(0, 1), selected_labels=num_cols, feature_size=15, hue=None):
    """
    Creates a biplot for PCA.
    Args:
        scores: The PCA transformed data, in pandas df format.
        loadings: The loading scores (eigenvectors).
        labels: Feature labels.
        components: Tuple for the two principal components to plot.
        selected_labels: Specific variables to include; defaults to all variables.
        feature_size: Marker size for the scatterplot points (scalar or per-sample values).
        hue: Optional column in `scores` used to colour the points.
    """
    pc_x, pc_y = components
    cols=scores.columns

    fig, axes = plt.subplots(1,1,figsize=(10,7))

    # Scatter plot of the scores
    sns.scatterplot(data=scores, x=cols[pc_x], y=cols[pc_y], ax=axes, s=feature_size, alpha=.5, hue=hue, palette='bright').set_title(f'Biplot of PC{pc_x+1} and PC{pc_y+1}')
    axes.grid(True, color='#F2F2F2', zorder=0.5, linewidth=.8)
    plt.xlabel(f"PC{pc_x + 1} (~{pca.explained_variance_ratio_[pc_x]*100:.0f}% Variance)")
    plt.ylabel(f"PC{pc_y + 1} (~{pca.explained_variance_ratio_[pc_y]*100:.0f}% Variance)")
    
    if hue is not None:
        plt.legend(bbox_to_anchor=(1.1,1.1)).get_frame().set_linewidth(0)

    # Add the loading vectors
    for i, label in enumerate(labels):
        if label in selected_labels:
            plt.arrow(0, 0, 
                        loadings[pc_x, i] * max(scores.iloc[:, pc_x]), 
                        loadings[pc_y, i] * max(scores.iloc[:, pc_y]), 
                        color='red', alpha=0.5, head_width=0.1)
            plt.text(loadings[pc_x, i] * max(scores.iloc[:, pc_x]) * 1.7, 
                        loadings[pc_y, i] * max(scores.iloc[:, pc_y]) * 1.2, 
                        label, color='red', ha='center', va='center', fontsize=15)

    plt.show()

# Call biplot
biplot(
    scores=X_pca_df_joined, 
    loadings=eigenvectors, 
    labels=num_cols, 
    components=(0,1)
    )

The Biplot as it stands has some important limitations:

  • The higher the number of original dimensions, the more difficult it is to read, and you can end up with a lot of overlapping text and arrows, obscuring some insights

  • Not all original dimensions are important in the chosen PCs (e.g. in this case stroke has very low loading scores for the first two Eigenvectors)

This is why the function above allows me to select only the features I am interested in exploring for the first two PCs.

In [109]:
pc1_2_cols = ['length','width','curb_weight','engine_size','horsepower','city_mpg','highway_mpg','price']

biplot(
    scores=X_pca_df_joined, 
    loadings=eigenvectors, 
    labels=num_cols, 
    components=(0,1),
    selected_labels=pc1_2_cols
    )

Similar to what can be inferred from the heatmap, this more focused biplot also lets us see that some of the "efficiency" information captured by PC1 is also captured by PC2, since the efficiency features have a mild component on PC2 as well (mind the scale differences, though).
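To see this numerically, you can pull the efficiency rows out of the loading_scores frame from earlier and compare their PC1 and PC2 loadings:

# Loading scores of the fuel-efficiency features on the first two PCs
print(loading_scores.loc[['city_mpg', 'highway_mpg'], ['PC1', 'PC2']])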

Finally, to reinforce the understanding of how the PCs capture the data originally contained in the dataframe, we can overlay one of the original dimensions on the dimensionality-reduced data and see how it matches the biplot arrow for that same dimension. If I add horsepower on top of the data and ask the function to match the point size to the horsepower, we get the following (note that I am scaling the variable for visual impact only):

In [110]:
biplot(
    scores=X_pca_df_joined, 
    loadings=eigenvectors, 
    labels=num_cols, 
    components=(0,1),
    selected_labels=pc1_2_cols,
    feature_size=df_nums['horsepower']**1.1
    )

As we can see, the original horsepower data matches the direction and "strength" of the arrow. The same can be done with another variable of choice, in this case city_mpg (again, scaling for visual impact only):

In [111]:
biplot(
    scores=X_pca_df_joined, 
    loadings=eigenvectors, 
    labels=num_cols, 
    components=(0,1),
    selected_labels=pc1_2_cols,
    feature_size=df_nums['city_mpg']**1.4
    )

And these are the main outputs and interpretations from PCA that I believe will interest your stakeholders the most!

If you have suggestions on how to further expand this short tutorial, please do reach out. I hope you enjoyed reading it as much as I did developing it.