Kaggle Machine Learning Competition: Phishing Email Detection#

Link to private Kaggle competition

Import modules#

import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib_inline.backend_inline
# Render notebook figures as crisp vector SVG instead of raster PNG.
matplotlib_inline.backend_inline.set_matplotlib_formats("svg")
# White figure background so plots stay readable on dark notebook themes.
plt.rcParams["figure.facecolor"] = "white"

Read and split the data#

from sklearn.model_selection import train_test_split

def process_data(data):
    """Preprocessing hook for the email text column.

    Currently an identity transform: *data* is returned unchanged. An
    earlier experiment stripped non-ASCII characters via
    ``data.str.encode("ascii", "ignore").str.decode("ascii")`` but was
    disabled; keep this hook so preprocessing can be re-enabled in one
    place for both the train and test splits.
    """
    return data

# Competition data is expected under ~/github/KaggleCompetitionSpamNotSpam/datasets.
home_dir = os.path.expanduser("~")
local_dir = f"{home_dir}/github/KaggleCompetitionSpamNotSpam/datasets"

# First CSV column is the row identifier; use it as the DataFrame index.
train_data = pd.read_csv(f"{local_dir}/train.csv", index_col=0)
test_data = pd.read_csv(f"{local_dir}/test.csv", index_col=0)

# Run the (currently no-op) preprocessing hook over both splits.
train_data["email"] = process_data(train_data["email"])
test_data["email"] = process_data(test_data["email"])

# 85/15 train/validation split, stratified on the label so the spam /
# not-spam ratio is preserved in both splits; fixed seed for reproducibility.
train, val = train_test_split(train_data, test_size=0.15, random_state=17, stratify=train_data["label"])

print(f"Train data shape: {train.shape}")
print(f"Validation data shape: {val.shape}")
Train data shape: (6386, 4)
Validation data shape: (1127, 4)

Feature extraction - Vectorize words#

from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words counts over lowercased unigrams and bigrams.
count_vect = CountVectorizer(analyzer="word", ngram_range=(1, 2), lowercase=True)

# Fit the vocabulary on the TRAINING emails only (no leakage from val/test).
X_train_counts = count_vect.fit_transform(train["email"])
# Inspect the learned vocabulary size via a throwaway Series/DataFrame.
ser_features_names = pd.Series(count_vect.get_feature_names_out())
ser_features_names.to_frame("feature_names").info()
&lt;class 'pandas.core.frame.DataFrame'&gt;
RangeIndex: 663546 entries, 0 to 663545
Data columns (total 1 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   feature_names  663546 non-null  object
dtypes: object(1)
memory usage: 5.1+ MB

Feature extraction - Find importance of words#

from sklearn.feature_extraction.text import TfidfTransformer

# Rescale the raw counts to TF-IDF so very common n-grams are down-weighted.
tfidf_transformer = TfidfTransformer()

# Fit the IDF weights on the training counts only, then transform them.
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
# idf_ holds one inverse-document-frequency weight per vocabulary term.
ser_features_tfidf = pd.Series(tfidf_transformer.idf_)
ser_features_tfidf.to_frame("importance").info()
&lt;class 'pandas.core.frame.DataFrame'&gt;
RangeIndex: 663546 entries, 0 to 663545
Data columns (total 1 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   importance  663546 non-null  float64
dtypes: float64(1)
memory usage: 5.1 MB

Model Selection, training, and testing#

from sklearn.linear_model import SGDClassifier
from sklearn import metrics

# Linear SVM (hinge loss) trained with SGD on the TF-IDF features.
# NOTE: max_iter must be an int — the original `2e3` is a float and is
# rejected by scikit-learn's parameter validation in recent versions.
text_clf = SGDClassifier(loss="hinge", penalty="l2",
                         alpha=1e-6, random_state=17,
                         max_iter=2000, tol=1e-12,
                         fit_intercept=True)

text_clf.fit(X_train_tfidf, train["label"])
# Validation emails go through the SAME counts -> tf-idf pipeline that was
# fitted on the training split.
y_pred = text_clf.predict(tfidf_transformer.transform(count_vect.transform(val["email"])))

print(metrics.classification_report(val["label"], y_pred, target_names=["not spam", "spam"], digits=4))
              precision    recall  f1-score   support

    not spam     0.9952    0.9952    0.9952       838
        spam     0.9862    0.9862    0.9862       289

    accuracy                         0.9929      1127
   macro avg     0.9907    0.9907    0.9907      1127
weighted avg     0.9929    0.9929    0.9929      1127

Visualize the results#

fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(10, 4))

# Confusion matrix computed from the hard 0/1 predictions.
cm = metrics.confusion_matrix(val["label"], y_pred)
cm_display = metrics.ConfusionMatrixDisplay(cm).plot(ax=axes[0], cmap=plt.cm.Oranges)

# A precision-recall curve needs CONTINUOUS scores, not hard predictions:
# feeding y_pred collapses the curve to a single operating point. Use the
# SVM decision_function margin on the validation set as the score.
y_score = text_clf.decision_function(tfidf_transformer.transform(count_vect.transform(val["email"])))
precision, recall, _ = metrics.precision_recall_curve(val["label"], y_score)
pr_display = metrics.PrecisionRecallDisplay(precision=precision, recall=recall).plot(ax=axes[1], color="coral")
Confusion Matrix and Precision vs Recall Plot

Submit the results#

# Score the Kaggle test set with the same counts -> tf-idf -> SVM pipeline.
y_submit = text_clf.predict(tfidf_transformer.transform(count_vect.transform(test_data["email"])))

# Build the submission frame: a single "label" column indexed by test "id".
df_submit = test_data.reset_index()[["id"]].copy().set_index("id")
df_submit["label"] = y_submit
df_submit.to_csv(f"{local_dir}/submit_REV2.csv")

Click here to access GitHub repository