Creating and Evaluating a Model to Detect Fraud Using Synthetic Payment Data¶

Overview¶

This is a short project where I use synthetic payment data created by Edgar Lopez-Rojas (available on Kaggle) to build and evaluate first a Logistic Regression model -- and then a support vector machine to address the LR model's limitations -- to detect likely financial fraud based on several features of the data. Please note: I am not a financial expert; the commentary here is based on my limited knowledge of day-to-day finances and is for display purposes only. The analysis presented here does not constitute financial advice of any kind.

IMPORTANT: Rather than showing you how to build the perfect LR system using standard machine learning approaches, this short project is intended to illustrate the complexities of using real-world data to detect something as uncommon as financial fraud, and why special attention is required to measure and evaluate the actual performance of the model.

If you find something that you think is wrong or could be done better or simpler, feel free to contact me!


The data comes with the following info and description:

[This is] a synthetic dataset generated using the simulator called Paysim [that] uses aggregated data from the private dataset to generate a synthetic dataset that resembles the normal operation of transactions and injects malicious behaviour to later evaluate the performance of fraud detection methods.

PaySim simulates mobile money transactions based on a sample of real transactions extracted from one month of financial logs from a mobile money service implemented in an African country. The original logs were provided by a multinational company [...].

Headers:

-step - maps a unit of time in the real world. In this case 1 step is 1 hour of time. Total steps 744 (30 days simulation).

-type - CASH-IN, CASH-OUT, DEBIT, PAYMENT and TRANSFER.

-amount - amount of the transaction in local currency.

-nameOrig - customer who started the transaction

-oldbalanceOrg - initial balance before the transaction

-newbalanceOrig - new balance after the transaction

-nameDest - customer who is the recipient of the transaction

-oldbalanceDest - initial balance recipient before the transaction. Note that there is not information for customers that start with M (Merchants).

-newbalanceDest - new balance recipient after the transaction. Note that there is not information for customers that start with M (Merchants).

-isFraud - This is the transactions made by the fraudulent agents inside the simulation. In this specific dataset the fraudulent behavior of the agents aims to profit by taking control or customers accounts and try to empty the funds by transferring to another account and then cashing out of the system.

-isFlaggedFraud - The business model aims to control massive transfers from one account to another and flags illegal attempts. An illegal attempt in this dataset is an attempt to transfer more than 200.000 in a single transaction.
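The headers above translate into an explicit column schema. As a sketch (not part of the original pipeline, and assuming the `paysim_data.csv` filename used later in this notebook), passing a dtype mapping to `read_csv` makes the schema explicit and saves memory on a 6M+ row file:

```python
import pandas as pd

# Explicit dtypes inferred from the header descriptions above; 'category'
# for the small fixed set of transaction types saves memory on 6M+ rows.
paysim_dtypes = {
    'step': 'int32',
    'type': 'category',
    'amount': 'float64',
    'nameOrig': 'object',
    'oldbalanceOrg': 'float64',
    'newbalanceOrig': 'float64',
    'nameDest': 'object',
    'oldbalanceDest': 'float64',
    'newbalanceDest': 'float64',
    'isFraud': 'int8',
    'isFlaggedFraud': 'int8',
}

# Hypothetical load call (the notebook below uses a plain read_csv):
# pay_data = pd.read_csv('paysim_data.csv', dtype=paysim_dtypes)
```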


The pipeline uses seaborn, pandas, numpy, matplotlib and SciKit Learn.

Step 1 - Initial Exploratory Data Analysis.¶

In [ ]:
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
In [ ]:
# Load the data
pay_data = pd.read_csv('paysim_data.csv')

First, since we want to build a Logistic Regression model, which relies on observations being independent, let's check whether we have more than one observation coming from the same initial customer:

In [ ]:
pay_data['nameOrig'].value_counts()
Out[ ]:
C1902386530    3
C363736674     3
C545315117     3
C724452879     3
C1784010646    3
              ..
C98968405      1
C720209255     1
C1567523029    1
C644777639     1
C1280323807    1
Name: nameOrig, Length: 6353307, dtype: int64

And since some do, we'll start by dropping all "duplicate" transactions (i.e. transactions that come from the same initial customer):

In [ ]:
pay_data = pay_data.drop_duplicates(subset='nameOrig')

Additionally, we'll check whether any observation has amount == 0, perhaps by mistake, since we are not interested in these observations:

In [ ]:
pay_data[pay_data['amount'] == 0]
Out[ ]:
step type amount nameOrig oldbalanceOrg newbalanceOrig nameDest oldbalanceDest newbalanceDest isFraud isFlaggedFraud
2736447 212 CASH_OUT 0.0 C1510987794 0.0 0.0 C1696624817 0.00 0.00 1 0
3247298 250 CASH_OUT 0.0 C521393327 0.0 0.0 C480398193 0.00 0.00 1 0
3760289 279 CASH_OUT 0.0 C539112012 0.0 0.0 C1106468520 538547.63 538547.63 1 0
5563714 387 CASH_OUT 0.0 C1294472700 0.0 0.0 C1325541393 7970766.57 7970766.57 1 0
5996408 425 CASH_OUT 0.0 C832555372 0.0 0.0 C1462759334 76759.90 76759.90 1 0
5996410 425 CASH_OUT 0.0 C69493310 0.0 0.0 C719711728 2921531.34 2921531.34 1 0
6168500 554 CASH_OUT 0.0 C10965156 0.0 0.0 C1493336195 230289.66 230289.66 1 0
6205440 586 CASH_OUT 0.0 C1303719003 0.0 0.0 C900608348 1328472.86 1328472.86 1 0
6266414 617 CASH_OUT 0.0 C1971175979 0.0 0.0 C1352345416 0.00 0.00 1 0
6281483 646 CASH_OUT 0.0 C2060908932 0.0 0.0 C1587892888 0.00 0.00 1 0
6281485 646 CASH_OUT 0.0 C1997645312 0.0 0.0 C601248796 0.00 0.00 1 0
6296015 671 CASH_OUT 0.0 C1960007029 0.0 0.0 C459118517 27938.72 27938.72 1 0
6351226 702 CASH_OUT 0.0 C1461113533 0.0 0.0 C1382150537 107777.02 107777.02 1 0
6362461 730 CASH_OUT 0.0 C729003789 0.0 0.0 C1388096959 1008609.53 1008609.53 1 0
6362463 730 CASH_OUT 0.0 C2088151490 0.0 0.0 C1156763710 0.00 0.00 1 0
6362585 741 CASH_OUT 0.0 C312737633 0.0 0.0 C1400061387 267522.87 267522.87 1 0

And since we do, we'll drop them next. Interestingly, all of these observations are labeled as fraud (isFraud == 1) even though the amount transferred is 0, which seems inconsistent with the definition of this variable in the introduction.

In [ ]:
pay_data = pay_data[(pay_data['amount'] > 0)]

And now let's explore the data!

In [ ]:
#Explore the data
pay_data.head(15)
Out[ ]:
step type amount nameOrig oldbalanceOrg newbalanceOrig nameDest oldbalanceDest newbalanceDest isFraud isFlaggedFraud
0 1 PAYMENT 9839.64 C1231006815 170136.00 160296.36 M1979787155 0.0 0.00 0 0
1 1 PAYMENT 1864.28 C1666544295 21249.00 19384.72 M2044282225 0.0 0.00 0 0
2 1 TRANSFER 181.00 C1305486145 181.00 0.00 C553264065 0.0 0.00 1 0
3 1 CASH_OUT 181.00 C840083671 181.00 0.00 C38997010 21182.0 0.00 1 0
4 1 PAYMENT 11668.14 C2048537720 41554.00 29885.86 M1230701703 0.0 0.00 0 0
5 1 PAYMENT 7817.71 C90045638 53860.00 46042.29 M573487274 0.0 0.00 0 0
6 1 PAYMENT 7107.77 C154988899 183195.00 176087.23 M408069119 0.0 0.00 0 0
7 1 PAYMENT 7861.64 C1912850431 176087.23 168225.59 M633326333 0.0 0.00 0 0
8 1 PAYMENT 4024.36 C1265012928 2671.00 0.00 M1176932104 0.0 0.00 0 0
9 1 DEBIT 5337.77 C712410124 41720.00 36382.23 C195600860 41898.0 40348.79 0 0
10 1 DEBIT 9644.94 C1900366749 4465.00 0.00 C997608398 10845.0 157982.12 0 0
11 1 PAYMENT 3099.97 C249177573 20771.00 17671.03 M2096539129 0.0 0.00 0 0
12 1 PAYMENT 2560.74 C1648232591 5070.00 2509.26 M972865270 0.0 0.00 0 0
13 1 PAYMENT 11633.76 C1716932897 10127.00 0.00 M801569151 0.0 0.00 0 0
14 1 PAYMENT 4098.78 C1026483832 503264.00 499165.22 M1635378213 0.0 0.00 0 0
In [ ]:
print('\nLength of Dataset: ', len(pay_data))
print('\nColumns: ', pay_data.columns)
Length of Dataset:  6353291

Columns:  Index(['step', 'type', 'amount', 'nameOrig', 'oldbalanceOrg', 'newbalanceOrig',
       'nameDest', 'oldbalanceDest', 'newbalanceDest', 'isFraud',
       'isFlaggedFraud'],
      dtype='object')

The dataset is over 6M rows! It comes with an important column (isFraud), which we will use later to train our model. Also, there is a slight discrepancy between the column names oldbalanceOrg and newbalanceOrig, so to make them consistent (and easier to remember), we'll rename oldbalanceOrg to oldbalanceOrig.

In [ ]:
#Pandas Index objects are immutable, so we can't assign to a single element; rename() handles this cleanly:
pay_data = pay_data.rename(columns={'oldbalanceOrg': 'oldbalanceOrig'})
pay_data.columns
Out[ ]:
Index(['step', 'type', 'amount', 'nameOrig', 'oldbalanceOrig',
       'newbalanceOrig', 'nameDest', 'oldbalanceDest', 'newbalanceDest',
       'isFraud', 'isFlaggedFraud'],
      dtype='object')

Now we have to do some QC'ing and exploration before we can move forward: check whether we have missing values, typos, or incoherent data types (e.g. dtype object where it should be int or float), and look at the range and distribution of the data in each column, outliers, etcetera.

In [ ]:
pay_data.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 6353291 entries, 0 to 6362619
Data columns (total 11 columns):
 #   Column          Dtype  
---  ------          -----  
 0   step            int64  
 1   type            object 
 2   amount          float64
 3   nameOrig        object 
 4   oldbalanceOrig  float64
 5   newbalanceOrig  float64
 6   nameDest        object 
 7   oldbalanceDest  float64
 8   newbalanceDest  float64
 9   isFraud         int64  
 10  isFlaggedFraud  int64  
dtypes: float64(5), int64(3), object(3)
memory usage: 581.7+ MB
In [ ]:
pay_data.isna().sum()
Out[ ]:
step              0
type              0
amount            0
nameOrig          0
oldbalanceOrig    0
newbalanceOrig    0
nameDest          0
oldbalanceDest    0
newbalanceDest    0
isFraud           0
isFlaggedFraud    0
dtype: int64
In [ ]:
pay_data.describe()
Out[ ]:
step amount oldbalanceOrig newbalanceOrig oldbalanceDest newbalanceDest isFraud isFlaggedFraud
count 6.353291e+06 6.353291e+06 6.353291e+06 6.353291e+06 6.353291e+06 6.353291e+06 6.353291e+06 6.353291e+06
mean 2.432836e+02 1.798499e+05 8.339021e+05 8.551329e+05 1.100557e+06 1.224843e+06 1.287364e-03 2.518380e-06
std 1.423257e+02 6.038027e+05 2.888229e+06 2.924031e+06 3.398649e+06 3.673608e+06 3.585676e-02 1.586939e-03
min 1.000000e+00 1.000000e-02 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00
25% 1.550000e+02 1.338926e+04 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00
50% 2.390000e+02 7.487491e+04 1.421000e+04 0.000000e+00 1.327156e+05 2.146720e+05 0.000000e+00 0.000000e+00
75% 3.340000e+02 2.087207e+05 1.073220e+05 1.442728e+05 9.429916e+05 1.111917e+06 0.000000e+00 0.000000e+00
max 7.430000e+02 9.244552e+07 5.958504e+07 4.958504e+07 3.560159e+08 3.561793e+08 1.000000e+00 1.000000e+00

The data looks severely skewed (check the min, 25%, 50%, 75% and max rows for the columns oldbalanceOrig, newbalanceOrig, oldbalanceDest and newbalanceDest) and extremely variable (check std for the same columns). Let's inspect this visually. Be mindful that we'll use a log scale on some of the x axes of the histograms.

In [ ]:
fig, axs = plt.subplots(2,5, figsize=(20,10))

axs[0,0].hist(pay_data['amount'], bins=50, color = 'tab:blue')
axs[0,0].set_xscale('log')
axs[0,0].set_ylabel('Frequency', fontsize=14)
axs[0,0].set_title('amount', fontsize=14)
axs[1,0].boxplot(pay_data['amount'])
axs[1,0].set_ylabel('IQR + Outliers', fontsize=14)

axs[0,1].hist(pay_data['oldbalanceOrig'], bins=50, color = 'tab:orange')
axs[0,1].set_title('oldbalanceOrig', fontsize=14)
axs[1,1].boxplot(pay_data['oldbalanceOrig'])

axs[0,2].hist(pay_data['newbalanceOrig'], bins=50, color = 'tab:green')
axs[0,2].set_xscale('log')
axs[0,2].set_title('newbalanceOrig', fontsize=14)
axs[1,2].boxplot(pay_data['newbalanceOrig'])

axs[0,3].hist(pay_data['oldbalanceDest'], bins=50, color = 'tab:red')
axs[0,3].set_xscale('log')
axs[0,3].set_title('oldbalanceDest', fontsize=14)
axs[1,3].boxplot(pay_data['oldbalanceDest'])

axs[0,4].hist(pay_data['newbalanceDest'], bins=50, color = 'tab:purple')
axs[0,4].set_title('newbalanceDest', fontsize=14)
axs[1,4].boxplot(pay_data['newbalanceDest'])

plt.show()

Damn! The data is ridiculously skewed! But this is somewhat expected: remember from the description that oldbalanceDest and newbalanceDest carry no information for merchant customers (nameDest values that start with an 'M')? That probably explains some of the skewness in those variables. Why? If we think about what the data represents (from the Kaggle description of the data, see the Overview):

[This is] a synthetic dataset [...] that resembles the normal operation of transactions and injects malicious behaviour to later evaluate the performance of fraud detection methods.

PaySim simulates mobile money transactions based on a sample of real transactions extracted from one month of financial logs from a mobile money service implemented in an African country. The original logs were provided by a multinational company.

we can understand why the data is so heavily skewed towards smaller values: most people do not transfer such enormous amounts of money between accounts! The 'abnormalities' come from the other tail of the distribution, but we need to take time to decide what to do with them, because their exclusion (and to what degree) will affect the performance of the model. We need to explore the data in more detail before making decisions.
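One common option (not used in this notebook, just a sketch) for taming this kind of right skew before modelling is a log transform. On toy data mimicking the heavy tail of `amount`:

```python
import numpy as np
import pandas as pd

# Toy data mimicking the heavy right skew of `amount`: mostly small
# transfers plus one enormous one.
amounts = pd.Series([10.0, 50.0, 200.0, 1_000.0, 5_000.0, 9_000_000.0])

# log1p computes log(1 + x): monotone, compresses large values,
# and is safe at zero (though we already dropped amount == 0 above).
log_amounts = np.log1p(amounts)

# The sample skewness drops sharply after the transform.
print(amounts.skew(), log_amounts.skew())
```

Whether to apply such a transform depends on the model: tree-based methods are insensitive to it, while distance- and margin-based models can benefit.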

Let's start checking out information that involves merchants and customers separately.

In [ ]:
n_merchants_nameDest = len(pay_data['nameDest'][pay_data['nameDest'].str.contains('M')])
n_merchants_nameOrig = len(pay_data['nameOrig'][pay_data['nameOrig'].str.contains('M')])
perc_merchants_nameDest = round((n_merchants_nameDest/len(pay_data))*100,2)
perc_merchants_nameOrig = round((n_merchants_nameOrig/len(pay_data))*100,2)

print('Number of transaction where destination is a merchant: ', n_merchants_nameDest)
print('Number of transaction where origin is a merchant: ', n_merchants_nameOrig)
print('Percentage of transaction where destination is a merchant: ', perc_merchants_nameDest,'%')
print('Percentage of transaction where origin is a merchant: ', perc_merchants_nameOrig,'%')
Number of transaction where destination is a merchant:  2148333
Number of transaction where origin is a merchant:  0
Percentage of transaction where destination is a merchant:  33.81 %
Percentage of transaction where origin is a merchant:  0.0 %
In [ ]:
n_customers_nameDest = len(pay_data['nameDest'][pay_data['nameDest'].str.contains('C')])
n_customers_nameOrig = len(pay_data['nameOrig'][pay_data['nameOrig'].str.contains('C')])
perc_customers_nameDest = round((n_customers_nameDest/len(pay_data))*100,2)
perc_customers_nameOrig = round((n_customers_nameOrig/len(pay_data))*100,2)

print('Number of transaction where destination is a customer: ', n_customers_nameDest)
print('Number of transaction where origin is a customer: ', n_customers_nameOrig)
print('Percentage of transaction where destination is a customer: ', perc_customers_nameDest,'%')
print('Percentage of transaction where origin is a customer: ', perc_customers_nameOrig,'%')
Number of transaction where destination is a customer:  4204958
Number of transaction where origin is a customer:  6353291
Percentage of transaction where destination is a customer:  66.19 %
Percentage of transaction where origin is a customer:  100.0 %

Looks like merchants do not initiate transactions in this dataset, and they account for nearly 34% of the destinations of these transactions. Customers are the only ones initiating transactions, and they account for the remaining 66% of the destinations.

Let's check out for now the transactions that are done customer-customer and customer-merchant, since there are no transactions originated from merchants.

In [ ]:
customer_customer = pay_data[pay_data['nameOrig'].str.contains('C') & pay_data['nameDest'].str.contains('C')]
customer_customer.head()
Out[ ]:
step type amount nameOrig oldbalanceOrig newbalanceOrig nameDest oldbalanceDest newbalanceDest isFraud isFlaggedFraud
2 1 TRANSFER 181.00 C1305486145 181.0 0.00 C553264065 0.0 0.00 1 0
3 1 CASH_OUT 181.00 C840083671 181.0 0.00 C38997010 21182.0 0.00 1 0
9 1 DEBIT 5337.77 C712410124 41720.0 36382.23 C195600860 41898.0 40348.79 0 0
10 1 DEBIT 9644.94 C1900366749 4465.0 0.00 C997608398 10845.0 157982.12 0 0
15 1 CASH_OUT 229133.94 C905080434 15325.0 0.00 C476402209 5083.0 51513.44 0 0
In [ ]:
customer_customer['type'].unique()
Out[ ]:
array(['TRANSFER', 'CASH_OUT', 'DEBIT', 'CASH_IN'], dtype=object)
In [ ]:
customer_customer.describe()
Out[ ]:
step amount oldbalanceOrig newbalanceOrig oldbalanceDest newbalanceDest isFraud isFlaggedFraud
count 4.204958e+06 4.204958e+06 4.204958e+06 4.204958e+06 4.204958e+06 4.204958e+06 4.204958e+06 4.204958e+06
mean 2.427839e+02 2.650655e+05 1.225089e+06 1.260426e+06 1.662837e+06 1.850621e+06 1.945085e-03 3.805032e-06
std 1.421379e+02 7.275209e+05 3.482954e+06 3.523145e+06 4.064136e+06 4.385451e+06 4.406021e-02 1.950646e-03
min 1.000000e+00 1.000000e-02 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00
25% 1.550000e+02 7.608690e+04 0.000000e+00 0.000000e+00 1.396720e+05 2.217264e+05 0.000000e+00 0.000000e+00
50% 2.370000e+02 1.589422e+05 1.815700e+04 0.000000e+00 5.512318e+05 6.837139e+05 0.000000e+00 0.000000e+00
75% 3.340000e+02 2.784826e+05 1.911609e+05 2.849697e+05 1.692878e+06 1.910872e+06 0.000000e+00 0.000000e+00
max 7.430000e+02 9.244552e+07 5.958504e+07 4.958504e+07 3.560159e+08 3.561793e+08 1.000000e+00 1.000000e+00
In [ ]:
customer_merchant = pay_data[pay_data['nameOrig'].str.contains('C') & pay_data['nameDest'].str.contains('M')]
customer_merchant.head()
Out[ ]:
step type amount nameOrig oldbalanceOrig newbalanceOrig nameDest oldbalanceDest newbalanceDest isFraud isFlaggedFraud
0 1 PAYMENT 9839.64 C1231006815 170136.0 160296.36 M1979787155 0.0 0.0 0 0
1 1 PAYMENT 1864.28 C1666544295 21249.0 19384.72 M2044282225 0.0 0.0 0 0
4 1 PAYMENT 11668.14 C2048537720 41554.0 29885.86 M1230701703 0.0 0.0 0 0
5 1 PAYMENT 7817.71 C90045638 53860.0 46042.29 M573487274 0.0 0.0 0 0
6 1 PAYMENT 7107.77 C154988899 183195.0 176087.23 M408069119 0.0 0.0 0 0
In [ ]:
customer_merchant['type'].unique()
Out[ ]:
array(['PAYMENT'], dtype=object)
In [ ]:
customer_merchant.describe()
Out[ ]:
step amount oldbalanceOrig newbalanceOrig oldbalanceDest newbalanceDest isFraud isFlaggedFraud
count 2.148333e+06 2.148333e+06 2.148333e+06 2.148333e+06 2148333.0 2148333.0 2148333.0 2148333.0
mean 2.442619e+02 1.305651e+04 6.822677e+04 6.184797e+04 0.0 0.0 0.0 0.0
std 1.426877e+02 1.255499e+04 1.990730e+05 1.970739e+05 0.0 0.0 0.0 0.0
min 1.000000e+00 2.000000e-02 0.000000e+00 0.000000e+00 0.0 0.0 0.0 0.0
25% 1.560000e+02 4.383570e+03 0.000000e+00 0.000000e+00 0.0 0.0 0.0 0.0
50% 2.490000e+02 9.481460e+03 1.053000e+04 0.000000e+00 0.0 0.0 0.0 0.0
75% 3.350000e+02 1.756005e+04 6.089100e+04 4.966411e+04 0.0 0.0 0.0 0.0
max 7.180000e+02 2.386380e+05 4.368662e+07 4.367380e+07 0.0 0.0 0.0 0.0

There are several interesting differences between customer-customer and customer-merchant transactions. The first is that customer-merchant transactions are only of the 'PAYMENT' type, while customer-customer transactions involve the 'TRANSFER', 'CASH_OUT', 'DEBIT' and 'CASH_IN' types. The distribution of the amount column is different as well: much higher amounts between customers than between a customer and a merchant.
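That split of transaction types by counterparty can be verified in a single cross-tabulation. A sketch on toy rows (on the real data you would pass the `pay_data` columns instead):

```python
import pandas as pd

# Toy transactions: counterparty prefix 'C' = customer, 'M' = merchant.
df = pd.DataFrame({
    'type': ['PAYMENT', 'TRANSFER', 'CASH_OUT', 'PAYMENT', 'DEBIT'],
    'nameDest': ['M123', 'C456', 'C789', 'M321', 'C654'],
})
dest_kind = df['nameDest'].str[0].map({'C': 'customer', 'M': 'merchant'})

# Rows: transaction type; columns: kind of destination account.
print(pd.crosstab(df['type'], dest_kind))
```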

However, the most important difference comes from the columns isFraud and isFlaggedFraud: there is only malicious activity in customer-customer transactions (denoted by '1' in those columns), not in customer-merchant payments... according to the dataset and its author, at least. One might say "that's obvious, since fraud almost exclusively happens between individuals rather than involving merchants"... and I would disagree. Since the data doesn't contain fraudulent behaviour between customers and merchants (only bona fide payments), we could focus on the customer-customer transfers, even though we would lose about 34% of the data. The problem is that including the customer_merchant data does not help our model: how could we evaluate whether the model is capable of detecting fraud among customer-merchant transactions if the data doesn't have the appropriate labels? All labels in that part of the dataset are the same (isFraud == 0 and isFlaggedFraud == 0), so there is no way to benchmark the model against it. But let's do a bit more exploration before making up our minds.

Let's check the general features and distributions of the isFraud data and compare frauds (isFraud == 1) with non-frauds (isFraud == 0):

In [ ]:
fig, axes = plt.subplots(1,2, figsize=(20, 5))

sns.barplot(x='type', y='amount', data=pay_data, ax=axes[0], hue='isFraud').set_title('Mean Amount Transferred (in Millions),\nby Movement Type', fontsize=16, weight='bold');
sns.histplot(x='amount', data=pay_data, ax=axes[1], hue='isFraud', stat='density', element='step', common_norm=False, bins=20, log_scale=True).set_title('Density Distribution for Amount Transferred,\nby Fraud Type', fontsize=16, weight='bold');

Note that the histogram is created with common_norm=False and plots Density rather than the usual Frequency. This is because there are very, very few observations with isFraud == 1, and with the default settings those observations are virtually invisible. This is an important consideration: when training a model we need a good number of observations for each label, otherwise the model will fail to learn to predict each label and differentiate between them. We can confirm numerically just how few such observations there are:

In [ ]:
frauds = pay_data[(pay_data['isFraud'] == 1)]
non_frauds = pay_data[(pay_data['isFraud'] == 0)]

print(
    'Total number of fraud transactions: ', + len(frauds), '\n'
    'Total number of transactions: ', len(pay_data), '\n'
    'Percentage of fraud transactions in the dataset: ', round((len(frauds)/len(pay_data))*100,3), '%'
)
Total number of fraud transactions:  8179 
Total number of transactions:  6353291 
Percentage of fraud transactions in the dataset:  0.129 %

After separating the data, we can check it again visually to make better sense of it (note the difference in the scale used for counts on each histogram).

In [ ]:
fig, axes = plt.subplots(1,2, figsize=(20, 5))

sns.histplot(x='amount', data=non_frauds, ax=axes[0], bins=20, log_scale=True).set_title('Distribution for `amount` in Non-Frauds Data', fontsize=16, weight='bold');
sns.histplot(x='amount', data=frauds, ax=axes[1], bins=20, log_scale=True).set_title('Distribution for `amount` in Frauds Data', fontsize=16, weight='bold');

Also, it seems that the fraudulent transactions are type == TRANSFER and type == CASH_OUT only. Let's see if this is true:

In [ ]:
frauds['type'].unique()
Out[ ]:
array(['TRANSFER', 'CASH_OUT'], dtype=object)

This gives us an important clue about where to focus when training our model. Let's see if we can train and evaluate a model to discriminate between frauds and non-frauds, considering the overlap between the two classes in amount and transfer type.

Step 2 - Training a Basic Logistic Regression Model to Evaluate Performance and Model Assumptions.¶

Since the outcome variable is binary (fraud/non-fraud), let's start with a logistic regression implementation. But which features from the dataset should we use? We can use mutual_info_classif from scikit-learn to estimate which features contribute the most to predicting the outcome variable. However, since this method does not work with categorical variables that have not been encoded (e.g. type), we need to encode them first. We'll evaluate all relevant variables from the original dataframe (i.e. we'll only exclude the names of origin and destination).

In [ ]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

X = pay_data[['type', 'amount', 'oldbalanceOrig', 'newbalanceOrig', 'oldbalanceDest', 'newbalanceDest']]
X_enc = X.copy()
X_enc['type'] = le.fit_transform(X['type'])

y = pay_data['isFraud']
In [ ]:
from sklearn.feature_selection import mutual_info_classif

results = pd.DataFrame({'Feature': X_enc.columns, 'Mutual Info with Target': mutual_info_classif(X_enc,y,random_state=0)})
results.sort_values('Mutual Info with Target', ascending=False)
Out[ ]:
Feature Mutual Info with Target
0 type 0.170757
2 oldbalanceOrig 0.002614
1 amount 0.002437
3 newbalanceOrig 0.000633
4 oldbalanceDest 0.000167
5 newbalanceDest 0.000113

So, as we could see in our initial EDA, the most important difference between frauds and non-frauds comes from the variable type, while amount ranks much lower due to the clear overlap in distributions we saw earlier. This basically confirms some of our previous assumptions. All other variables seem rather independent from the target.

Logistic regression relies on several assumptions that we need to evaluate if we want to be sure the model is correctly specified. Namely,

  • Independence of observations,
  • Outcome variable must be binary,
  • Sufficiently large sample size,
  • No multicollinearity,
  • Linearity of independent variables and log-odds, and
  • No strongly influential outliers.

Let's build a model using the top three features from our earlier analysis (type, amount and oldbalanceOrig) and evaluate it. We'll also evaluate the classifier's performance.

In [ ]:
X = pay_data[['type', 'amount', 'oldbalanceOrig']].copy()  #.copy() avoids a SettingWithCopyWarning
X['type'] = le.fit_transform(X['type'])
y = pay_data['isFraud']
In [ ]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.7, test_size = 0.3, random_state=0)
In [ ]:
#   Considering the differences in the scales of the different variables, it is better to transform them 
#   to prevent them from having too much weight on the model.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
In [ ]:
lr = LogisticRegression()
lr.fit(X_train, y_train);
In [ ]:
print(
    'Train score:\t', lr.score(X_train, y_train),
    '\nTest score:\t', lr.score(X_test, y_test)
)
Train score:	 0.9986942648162268 
Test score:	 0.9986967389091642

Looks too good to be true, but we need to look closer. For a classifier, it is usually better to print a confusion matrix, which effectively shows us the number of true positives, false positives, false negatives, and true negatives from our classification. See the explanation below.

In [ ]:
y_predict = lr.predict(X_test)
In [ ]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_predict)
Out[ ]:
array([[1903501,      42],
       [   2442,       3]])

The Confusion Matrix tells us the number of:

-True Negatives (1903501, top left, or index [0][0]), which for our model is the number of transactions that were not fraud and that were classified as such.

-False Positives (42, top right, or index [0][1]), which is the number of transactions that were not fraud but were classified as fraud.

-False Negatives (2442 bottom left, or index [1][0]), which is the number of transactions that were frauds but were not classified as such.

-True Positives (3 bottom right, or index [1][1]), which is the number of transactions that were actual fraud and that were classified as such.

This basically tells us that the classifier is doing its intended job terribly: it catches almost none of the actual fraud (3 out of 2445 cases), and it also misclassifies some non-fraudulent activity as fraud. What the model does superbly well (which is what inflated our previous score, and why that score is misleading) is classifying non-fraud as non-fraud. But that is not what we need if we want it to detect fraud! The score we saw earlier is plain accuracy: the fraction of all transactions classified correctly, regardless of class. Because the overwhelming majority of transactions are non-fraud, a model can score near-perfect accuracy while detecting virtually no fraud at all.
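To see why the ~99.87% accuracy is meaningless here, note that a degenerate "model" that predicts non-fraud for everything scores almost exactly the same. A quick sketch using the class prevalence of this dataset:

```python
import numpy as np

# Class balance roughly matching this dataset: ~0.13% frauds.
n_total, n_fraud = 6_353_291, 8_179
y_true = np.zeros(n_total, dtype=np.int8)
y_true[:n_fraud] = 1

# "Model" that never predicts fraud.
y_pred = np.zeros(n_total, dtype=np.int8)

accuracy = (y_true == y_pred).mean()
print(accuracy)  # ~0.9987, despite catching zero frauds
```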

This point is driven further home by the usual metrics used to evaluate a classifier, which all come out very close to 0 here, since they all use True Positives in the numerator:

In [ ]:
from sklearn.metrics import recall_score, precision_score, f1_score
print(
    'Recall Score:\t', recall_score(y_test, y_predict), '\n'
    'Precision Score:\t', precision_score(y_test, y_predict), '\n'
    'F1 Score:\t', f1_score(y_test, y_predict)
)
Recall Score:	 0.001226993865030675 
Precision Score:	 0.06666666666666667 
F1 Score:	 0.0024096385542168677

Given that the Precision Score is "intuitively the ability of the classifier not to label as positive a sample that is negative", while the Recall Score is "intuitively the ability of the classifier to find all the positive samples" -- and that the F1 Score balances the two -- we need all of them to be as close to 1 as possible to be satisfied with our classifier.
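We can verify those three numbers by hand from the confusion matrix above (TP = 3, FP = 42, FN = 2442):

```python
# Counts taken from the confusion matrix printed above.
tp, fp, fn = 3, 42, 2442

recall = tp / (tp + fn)       # fraction of actual frauds we caught
precision = tp / (tp + fp)    # fraction of fraud alerts that were real frauds
f1 = 2 * precision * recall / (precision + recall)

print(recall, precision, f1)
# -> 0.001226993865030675 0.06666666666666667 0.0024096385542168677
# matching sklearn's recall_score, precision_score and f1_score output
```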

If we think about it, this is most likely due to the extreme scarcity of frauds in our original dataset (remember that only about 0.13% of transactions were frauds). A machine learning model learns in a way similar to humans: it needs appropriate exposure to both the fraud and non-fraud data to determine what each of them looks like. This is best exemplified using datasets with different proportions of frauds and non-frauds. See how the numbers in the confusion matrix (as well as the recall, precision and F1 scores) change with different proportions in the following example.

In [ ]:
# First we'll determine the sample sizes and the proportion of the combined dataset each sample
# will represent when it is joined with all the frauds data

proportions = [0.1 * i**2 for i in range(1, 10)]  #quadratically increasing multiples of len(frauds): 0.1, 0.4, 0.9, ..., 8.1
lengths = [int(p * len(frauds)) for p in proportions]

for i in lengths:
    print(f'Sample of {i} from non-fraud data equals {round((i*100/(i + len(frauds))),2)}% of the total dataset when the sample and the frauds data are combined')
Sample of 817 from non-fraud data equals 9.08% of the total dataset when the sample and the frauds data are combined
Sample of 3271 from non-fraud data equals 28.57% of the total dataset when the sample and the frauds data are combined
Sample of 7361 from non-fraud data equals 47.37% of the total dataset when the sample and the frauds data are combined
Sample of 13086 from non-fraud data equals 61.54% of the total dataset when the sample and the frauds data are combined
Sample of 20447 from non-fraud data equals 71.43% of the total dataset when the sample and the frauds data are combined
Sample of 29444 from non-fraud data equals 78.26% of the total dataset when the sample and the frauds data are combined
Sample of 40077 from non-fraud data equals 83.05% of the total dataset when the sample and the frauds data are combined
Sample of 52345 from non-fraud data equals 86.49% of the total dataset when the sample and the frauds data are combined
Sample of 66249 from non-fraud data equals 89.01% of the total dataset when the sample and the frauds data are combined
In [ ]:
for i in lengths:
    #Create the dataset combining the sample of different sizes with the frauds data:
    non_fraud_sample = non_frauds.sample(n = i, random_state=100)
    df = pd.concat([non_fraud_sample, frauds])

    #Repeat the feature and label selection, data splitting, training and evaluation of the model:
    pre_X_df = df[['type', 'amount', 'oldbalanceOrig']]
    X_df = pre_X_df.copy()
    X_df['type'] = le.fit_transform(pre_X_df['type'])
    y_df = df['isFraud']
    
    X_train_df, X_test_df, y_train_df, y_test_df = train_test_split(X_df, y_df, train_size = 0.7, test_size = 0.3, random_state=100)

    X_train_df = scaler.fit_transform(X_train_df)
    X_test_df = scaler.transform(X_test_df)

    lr.fit(X_train_df, y_train_df)

    #And we will use the model to make predictions on the real-world data (the original split `X_test` from the dataset)
    #so we can compare straight away how the model performs and what was the best proportion of labels to work with
    y_predict = lr.predict(X_test)

    #Print the confusion matrices and relevant scores 
    print(
        'For sample size of', i, ':', '\n'
        'Confusion matrix:' '\n', confusion_matrix(y_test, y_predict), '\n'
        'Recall Score: ', recall_score(y_test, y_predict), '\n'
        'Precision Score: ', precision_score(y_test, y_predict), '\n'
        'F1 Score: ', f1_score(y_test, y_predict), '\n'
        '\n', '---------------------', '\n'
    )
For sample size of 817 : 
Confusion matrix:
 [[  63658 1839885]
 [      0    2445]] 
Recall Score:  1.0 
Precision Score:  0.0013271238051814822 
F1 Score:  0.0026507297637923324 

 --------------------- 

For sample size of 3271 : 
Confusion matrix:
 [[  91661 1811882]
 [      0    2445]] 
Recall Score:  1.0 
Precision Score:  0.0013476071292550902 
F1 Score:  0.0026915870566036905 

 --------------------- 

For sample size of 7361 : 
Confusion matrix:
 [[ 321874 1581669]
 [    147    2298]] 
Recall Score:  0.939877300613497 
Precision Score:  0.0014507878005034197 
F1 Score:  0.0028971036527711594 

 --------------------- 

For sample size of 13086 : 
Confusion matrix:
 [[1469564  433979]
 [    458    1987]] 
Recall Score:  0.812678936605317 
Precision Score:  0.004557694866113413 
F1 Score:  0.009064553581000476 

 --------------------- 

For sample size of 20447 : 
Confusion matrix:
 [[1721103  182440]
 [    928    1517]] 
Recall Score:  0.6204498977505113 
Precision Score:  0.008246492386807787 
F1 Score:  0.016276649392173905 

 --------------------- 

For sample size of 29444 : 
Confusion matrix:
 [[1797528  106015]
 [   1226    1219]] 
Recall Score:  0.4985685071574642 
Precision Score:  0.011367663241136207 
F1 Score:  0.022228503177454208 

 --------------------- 

For sample size of 40077 : 
Confusion matrix:
 [[1827144   76399]
 [   1444    1001]] 
Recall Score:  0.40940695296523516 
Precision Score:  0.0129328165374677 
F1 Score:  0.025073580061368905 

 --------------------- 

For sample size of 52345 : 
Confusion matrix:
 [[1848547   54996]
 [   1589     856]] 
Recall Score:  0.35010224948875257 
Precision Score:  0.015326219293848026 
F1 Score:  0.02936686278882275 

 --------------------- 

For sample size of 66249 : 
Confusion matrix:
 [[1864295   39248]
 [   1711     734]] 
Recall Score:  0.3002044989775051 
Precision Score:  0.018358261217547897 
F1 Score:  0.034600608103330426 

 --------------------- 

As you can see, the larger the non-fraud sample combined with the frauds data (i.e. the more we dilute the frauds with normal transactions), the more trouble the model has distinguishing frauds from non-frauds, as expected. Let's move forward with a balanced model (50% frauds, 50% non-frauds) for the sake of the exercise.
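The collapse in precision under dilution can also be seen with a back-of-the-envelope calculation, independent of any particular model. Assuming a hypothetical classifier with a fixed recall and false-positive rate, precision necessarily falls as frauds become rarer:

```python
# Hypothetical classifier with fixed recall and false-positive rate;
# only the class balance changes between scenarios.
recall, fpr = 0.9, 0.05

for fraud_rate in [0.5, 0.1, 0.01, 0.001]:
    tp = recall * fraud_rate      # expected true positives per transaction
    fp = fpr * (1 - fraud_rate)   # expected false positives per transaction
    precision = tp / (tp + fp)
    print(f'fraud rate {fraud_rate:>6}: precision {precision:.3f}')
```

With a 50/50 balance this hypothetical classifier would sit around 95% precision; at a 0.1% fraud rate it drops below 2%, which mirrors the pattern in the confusion matrices above.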

In [ ]:
non_fraud_sample = non_frauds.sample(n = len(frauds), random_state=100)
pay_df = pd.concat([non_fraud_sample, frauds])

Let's get back to the assumptions of logistic regression:

  • Independence of observations,
  • Outcome variable must be binary,
  • Sufficiently large sample size,
  • No multicollinearity,
  • Linearity of independent variables and log-odds, and
  • No strongly influential outliers.

We dealt with the first one already when we subsetted the original dataframe to contain only transactions from single customers with no repeats. We know the second one holds as well, since our outcome variable isFraud can only be either 0 or 1. The third one we need to evaluate. How much is "sufficiently large"? Opinions vary, but there are some good rules of thumb: the total sample size should be at least 500 observations, and there should be at least 10 observations of the least frequent outcome for each independent variable. The sample size is already over 500 even for our subset pay_df, and we can see that the second rule of thumb holds as well:
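The second rule of thumb can be checked directly by counting the least frequent outcome and comparing it against 10 observations per independent variable. A minimal sketch, using hypothetical balanced labels in place of pay_df['isFraud']:

```python
import pandas as pd

# Hypothetical balanced labels standing in for pay_df['isFraud']
y = pd.Series([0] * 8000 + [1] * 8000, name='isFraud')
n_features = 3  # type, amount, oldbalanceOrig

min_events = y.value_counts().min()  # observations with the least frequent outcome
needed = 10 * n_features             # rule of thumb: 10 per independent variable
print(min_events, '>=', needed, '->', min_events >= needed)
```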

In [ ]:
for col in pay_df[['type', 'amount', 'oldbalanceOrig']].columns:
    print(col,'\n', pay_df[col].value_counts(), '\n')
type 
 CASH_OUT    7012
TRANSFER    4747
PAYMENT     2711
CASH_IN     1830
DEBIT         58
Name: type, dtype: int64 

amount 
 10000000.00    293
1165187.89       4
429257.45        4
362631.05        3
142791.28        3
              ... 
25714.69         1
226229.51        1
121636.35        1
64428.22         1
150155.42        1
Name: amount, Length: 12146, dtype: int64 

oldbalanceOrig 
 0.00           2748
10000000.00     142
164.00            5
1165187.89        4
429257.45         4
               ... 
369.00            1
93659.00          1
61143.00          1
52267.00          1
86243.00          1
Name: oldbalanceOrig, Length: 9373, dtype: int64 

Let's check for multicollinearity using the variance inflation factor (VIF):

In [ ]:
pre_X_df = pay_df[['type', 'amount', 'oldbalanceOrig']]
X_df = pre_X_df.copy()
X_df['type'] = le.fit_transform(pre_X_df['type'])
y_df = pay_df['isFraud']
    
X_train_df, X_test_df, y_train_df, y_test_df = train_test_split(X_df, y_df, train_size = 0.7, test_size = 0.3, random_state=100)

X_train_df = scaler.fit_transform(X_train_df)
X_test_df = scaler.transform(X_test_df)

lr.fit(X_train_df, y_train_df);
In [ ]:
from statsmodels.stats.outliers_influence import variance_inflation_factor
vif_data = pd.DataFrame({'Feature': X_df.columns, 'VIF': [variance_inflation_factor(X_df.values, i) for i in range(len(X_df.columns))]})
vif_data
Out[ ]:
Feature VIF
0 type 1.176049
1 amount 2.060868
2 oldbalanceOrig 1.867042

Since all VIF values are below 5, we can be confident there is little to no multicollinearity, so this assumption is met.

Next we will check for linearity of independent variables and log-odds. This can be performed using what is known as the Box-Tidwell test and then checked visually. The Box-Tidwell test involves transforming the continuous independent variables using the following formula: $$var \cdot \ln(var)$$ So we need to perform these transformations first, then fit a model and evaluate its results.

In [ ]:
pay_df = pay_df[(pay_df['oldbalanceOrig'] > 0)].reset_index()
pre_X_df = pay_df[['type', 'amount', 'oldbalanceOrig']]
X_df = pre_X_df.copy()
X_df['type'] = le.fit_transform(pre_X_df['type'])
y_df = pay_df['isFraud']
In [ ]:
X_df['log_amount'] = X_df['amount'] * np.log(X_df['amount'])
X_df['log_oldbalanceOrig'] = X_df['oldbalanceOrig'] * np.log(X_df['oldbalanceOrig'])
In [ ]:
X_lt = sm.tools.tools.add_constant(X_df, prepend=False)
In [ ]:
logit_results = sm.GLM(y_df, X_lt, family=sm.families.Binomial()).fit()
logit_results.summary()
Out[ ]:
Generalized Linear Model Regression Results
Dep. Variable: isFraud No. Observations: 13610
Model: GLM Df Residuals: 13604
Model Family: Binomial Df Model: 5
Link Function: Logit Scale: 1.0000
Method: IRLS Log-Likelihood: -6285.4
Date: Wed, 26 Oct 2022 Deviance: 12571.
Time: 13:52:38 Pearson chi2: 5.83e+04
No. Iterations: 8 Pseudo R-squ. (CS): 0.3450
Covariance Type: nonrobust
coef std err z P>|z| [0.025 0.975]
type 0.4805 0.016 29.744 0.000 0.449 0.512
amount 2.212e-05 7.02e-07 31.515 0.000 2.07e-05 2.35e-05
oldbalanceOrig -9.732e-07 2.12e-07 -4.588 0.000 -1.39e-06 -5.57e-07
log_amount -1.326e-06 4.47e-08 -29.663 0.000 -1.41e-06 -1.24e-06
log_oldbalanceOrig 4.728e-08 1.24e-08 3.803 0.000 2.29e-08 7.16e-08
const -1.8001 0.051 -35.326 0.000 -1.900 -1.700

We are interested in seeing whether the p-values of the test for the variables log_amount and log_oldbalanceOrig are non-significant (i.e. p > 0.05). If they are significant (as is the case for our model), it means that the relationship between the parent variables (amount and oldbalanceOrig) and the log-odds of the outcome variable is not linear. We can see this visually:

In [ ]:
predicted = logit_results.predict(X_lt) #predicted probabilities
log_odds = np.log(predicted / (1 - predicted)) #convert probabilities to log-odds for plotting

fig, axes = plt.subplots(1,2, figsize=(15,5))
sns.scatterplot(x= X_lt['amount'].values, y=log_odds, ax = axes[0], alpha=0.4).set_title('`Amount` vs Log-odds (isFraud)', fontsize=12, weight='bold')
axes[0].set_xlabel('`Amount` (M)')
axes[0].set_ylabel('Log-odds')

sns.scatterplot(x= X_lt['oldbalanceOrig'].values, y=log_odds, ax = axes[1], alpha=0.4).set_title('`oldbalanceOrig` vs Log-odds (isFraud)', fontsize=12, weight='bold')
axes[1].set_xlabel('`oldbalanceOrig`')
axes[1].set_ylabel('Log-odds');

This means that this assumption is not met, which is critical: we might have to introduce transformations on those variables to capture the non-linearity, or use another model altogether. But let's check the final assumption next: no strongly influential outliers. We'll use statsmodels' get_influence method for this.

In [ ]:
influence = logit_results.get_influence(observed=True)
In [ ]:
fig,ax = plt.subplots(figsize=(15,5))
influence.plot_index(ax = ax)
plt.plot(range(len(X_lt)), [4/len(X_lt)]*len(X_lt), c='r', alpha=0.8) #4/len(df) is a standard threshold for influence
fig.tight_layout()
In [ ]:
fig, ax = plt.subplots(figsize=(15,5))
influence.plot_influence(ax = ax)
fig.tight_layout()

So we can see that we have many outliers and many influential points in our subsetted dataframe pay_df. The index plot and influence plot give a good overview of where the outliers (studentised residuals axis) and the highly influential points (leverage axis) lie. However, when there are too many points it makes more sense to look at the tabular results: first Cook's D (cooks_d), then leverage (hat_diag) and then the studentised residuals (standard_resid):
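Beyond eyeballing the top values, the 4/n threshold drawn on the index plot can be applied directly to the summary frame to count how many points are flagged as influential. A sketch using hypothetical Cook's distances in place of the real summary_frame() output:

```python
import numpy as np
import pandas as pd

# Hypothetical Cook's distances standing in for influence.summary_frame()
rng = np.random.default_rng(0)
influence_df = pd.DataFrame({'cooks_d': rng.exponential(scale=0.001, size=1000)})

threshold = 4 / len(influence_df)  # common rule-of-thumb cutoff for influence
flagged = influence_df[influence_df['cooks_d'] > threshold]
print(f'{len(flagged)} of {len(influence_df)} points exceed 4/n = {threshold:.4f}')
```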

In [ ]:
influence_df = influence.summary_frame()
influence_df['cooks_d'].sort_values(ascending=False)[:10]
Out[ ]:
10213    0.048803
4752     0.040594
1749     0.040593
2779     0.040592
558      0.040588
13445    0.033482
1907     0.014154
5107     0.010009
12534    0.009133
1818     0.008870
Name: cooks_d, dtype: float64
In [ ]:
influence_df['hat_diag'].sort_values(ascending=False)[:10]
Out[ ]:
10213    0.184137
13445    0.160522
12534    0.091825
10215    0.084773
13447    0.066821
13567    0.055505
9716     0.050621
12207    0.037901
12536    0.028705
10217    0.025763
Name: hat_diag, dtype: float64
In [ ]:
influence_df['standard_resid'].sort_values()[:10]
Out[ ]:
1907   -73.925830
5107   -70.436013
1818   -68.177359
4441   -55.446559
5071   -47.438348
1749   -42.466870
4752   -42.269426
2779   -42.185536
558    -42.070360
424    -35.670637
Name: standard_resid, dtype: float64

This gives us an idea of where to start trimming if we wanted to go ahead with a Logistic Regression model. However, considering that we did not meet the linearity assumption earlier, it would be wiser to consider other models before coming back to try to transform the data. After all, logistic regression is just one of many, many supervised machine learning algorithms out there.

Step 3 - Considering an alternative model¶

Here is a question we haven't expressly considered yet: are we interested in setting up a model for predictive (categorisation) purposes or for inference purposes? The choice of a model, as well as the validity and usefulness of the chosen model, depends on the answer to this question.

If we wanted to build a model to understand how changes in the predictive variables are connected to changes in the outcome variable (in this case, how changes in amount or payment type are connected to the likelihood of a transaction being classified as Fraud), then we are interested in the inference side of things. If, on the other hand, we just wanted to create a model that will allow us to predict whether a new payment is likely to be fraudulent or not with good accuracy and few errors, then we are just interested in classification.

The assumptions of the models we've reviewed here (and that you can see on many other blogs about checking the validity of the assumptions of models such as linear regression) are more concerned with the inference part of the problem. This is important because if your assumptions do not hold, then you cannot draw sound conclusions from your model: if the model is not valid (i.e. the assumptions do not hold), the estimated relationships between the predictive variables and the outcome do not hold either, and thus we cannot correctly conclude how the independent variables affect the outcome.

Making the distinction here allows us to expand our choices into models that do not care much about assumptions (e.g. Decision Trees/Random Forests, SVMs, etc.) but give us the flexibility of fitting the data better and transforming it on the go. So, for now, and considering we are more interested in predicting (classifying) fraud than in understanding the relationship between the variables and the outcome, we will use a personal favourite of mine to solve this problem: Support Vector Machines, whose kernels can capture non-linear (e.g. polynomial) decision boundaries without us having to transform our data endlessly.
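The kernel flexibility mentioned above is easy to demonstrate on toy data: on concentric circles (a classic non-linearly-separable problem), a linear kernel struggles while an RBF kernel separates the classes without any manual feature transformation. A small sketch using scikit-learn's make_circles:

```python
from sklearn.datasets import make_circles
from sklearn.svm import SVC

# Toy non-linearly-separable data: two concentric circles
X, y = make_circles(n_samples=400, noise=0.05, factor=0.5, random_state=0)

linear_svm = SVC(kernel='linear').fit(X, y)
rbf_svm = SVC(kernel='rbf').fit(X, y)

print('linear kernel accuracy:', linear_svm.score(X, y))
print('RBF kernel accuracy:   ', rbf_svm.score(X, y))
```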

So we'll first use default and arbitrary hyperparameters and then we'll tune them. Please note that before implementing any SVM model we need to scale the data, but in our case we already did this when implementing our Logistic Regression model.
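As an aside, a tidy way to keep scaling and fitting together is to wrap them in a scikit-learn Pipeline, which ensures the scaler is fit on training data only (and, later, on the training folds only during cross-validation). A minimal sketch on synthetic stand-in data, not the notebook's actual features:

```python
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# Synthetic stand-in for the notebook's three features
X, y = make_classification(n_samples=500, n_features=3, n_informative=3,
                           n_redundant=0, random_state=0)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.3, random_state=0)

# The pipeline fits the scaler on the training split only,
# then applies the same transformation to the test split
clf = make_pipeline(StandardScaler(), SVC(kernel='rbf', C=1, gamma=0.1))
clf.fit(X_tr, y_tr)
print('test accuracy:', clf.score(X_te, y_te))
```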

In [ ]:
from sklearn.svm import SVC

svm = SVC(kernel='rbf', C= 1, gamma=0.1, random_state=0) #C = 1 is default, and gamma is just arbitrary for now
svm.fit(X_train_df, y_train_df)

y_predict_df = svm.predict(X_test_df)

print(
    'Training score:\t', svm.score(X_train_df, y_train_df),
    '\nTest score:\t', svm.score(X_test_df, y_test_df)
)
Training score:	 0.843056768558952 
Test score:	 0.8378158109209454
In [ ]:
print(
    'Confusion matrix:' '\n', confusion_matrix(y_test_df, y_predict_df),
    '\nRecall Score\t:', recall_score(y_test_df, y_predict_df),
    '\nPrecision Score\t:', precision_score(y_test_df, y_predict_df),
    '\nF1 Score:\t', f1_score(y_test_df, y_predict_df),
)
Confusion matrix:
 [[2315  139]
 [ 657 1797]] 
Recall Score	: 0.7322738386308069 
Precision Score	: 0.9282024793388429 
F1 Score:	 0.8186788154897495

We see that even with these default and arbitrary values we get much better results altogether, and we haven't optimised the hyperparameters yet! Hyperparameter tuning is what we'll do next to get the best possible scores. We could manually search for different combinations of hyperparameters, or even set up a loop as follows and then repeat it for other kernels:

largest_score = {'score': 0, 'gamma': 1, 'C': 1}
for gamma in range(1,20):
  for C in range(1,20):
    classifier = SVC(kernel='rbf', C=C, gamma=gamma)
    classifier.fit(X_train_df, y_train_df)
    score = classifier.score(X_test_df, y_test_df)
    if score > largest_score['score']:
      largest_score['score'] = score
      largest_score['gamma'] = gamma
      largest_score['C'] = C
print(largest_score)

But there are more efficient ways to do this. One such way is GridSearchCV, which performs an "exhaustive search over specified parameter values for an estimator": it takes dictionaries and lists as evaluation points for the model and outputs a series of metrics that we can use to evaluate which combination of hyperparameters gives the best results. Importantly, it has the advantage of performing cross-validation on the go! Here is more information about the hyperparameters for the RBF kernel of an SVM.

First, we set up the ranges for the hyperparameter search using numpy's logspace.

In [ ]:
from sklearn.model_selection import GridSearchCV
In [ ]:
C_range = np.logspace(-10,10,5)
gamma_range = np.logspace(-10, 10, 5)
param_grid = dict(gamma=gamma_range, C=C_range)
In [ ]:
cv_search = GridSearchCV(svm, param_grid=param_grid, scoring='f1') #we'll use F1 since it balances precision and recall as seen earlier
cv_search.fit(X_train_df, y_train_df);

Results can then be explored by passing the results to pandas and converting it to a dataframe:

In [ ]:
search_results = pd.DataFrame(cv_search.cv_results_)
search_results.sort_values(by='rank_test_score').head()
Out[ ]:
mean_fit_time std_fit_time mean_score_time std_score_time param_C param_gamma params split0_test_score split1_test_score split2_test_score split3_test_score split4_test_score mean_test_score std_test_score rank_test_score
22 30.938518 6.145725 0.049392 0.001331 10000000000.0 1.0 {'C': 10000000000.0, 'gamma': 1.0} 0.972696 0.974071 0.970940 0.975986 0.965517 0.971842 0.003569 1
17 4.847964 1.539862 0.072410 0.001604 100000.0 1.0 {'C': 100000.0, 'gamma': 1.0} 0.968207 0.971698 0.964816 0.968976 0.957071 0.966153 0.005044 2
13 1.670781 0.061041 0.381077 0.002482 1.0 100000.0 {'C': 1.0, 'gamma': 100000.0} 0.951043 0.943861 0.951649 0.940693 0.941446 0.945738 0.004701 3
18 2.389618 0.219896 0.354206 0.001954 100000.0 100000.0 {'C': 100000.0, 'gamma': 100000.0} 0.942844 0.936151 0.943379 0.935232 0.931305 0.937782 0.004649 4
12 0.440423 0.004139 0.188555 0.001574 1.0 1.0 {'C': 1.0, 'gamma': 1.0} 0.887563 0.894785 0.883912 0.904084 0.903811 0.894831 0.008226 5

Or if we just want to know the best parameters we can print them directly:

In [ ]:
print("The best parameters are %s with a score of %0.4f" % (cv_search.best_params_, cv_search.best_score_))
The best parameters are {'C': 10000000000.0, 'gamma': 1.0} with a score of 0.9718

This is a really high score, so we can now use those parameters to test the performance of the classifier on the test data (GridSearchCV already used parts of the training data as validation data multiple times, but we haven't touched the test data we set aside as X_test_df and y_test_df):

In [ ]:
svm_final = SVC(kernel='rbf', C= 10000000000, gamma=1.0, random_state=0)
svm_final.fit(X_train_df, y_train_df);
In [ ]:
y_predict_df = svm_final.predict(X_test_df)

print(
    'Confusion matrix:' '\n', confusion_matrix(y_test_df, y_predict_df),
    '\nRecall Score\t:', recall_score(y_test_df, y_predict_df),
    '\nPrecision Score\t:', precision_score(y_test_df, y_predict_df),
    '\nF1 Score:\t', f1_score(y_test_df, y_predict_df),
)
Confusion matrix:
 [[2337  117]
 [  19 2435]] 
Recall Score	: 0.9922575387123065 
Precision Score	: 0.954153605015674 
F1 Score:	 0.9728326008789453

We could keep improving this classifier, but I think this suffices for this exercise.

Thanks for reading!