Spam Email Classification¶
The goal of this project is to experiment with different models to find which one works best for email classification. This is a binary classification problem that will be solved using supervised learning.
The dataset was obtained from UCI:
Hopkins, M., Reeber, E., Forman, G., & Suermondt, J. (1999). Spambase [Dataset]. UCI Machine Learning Repository. https://doi.org/10.24432/C53G6X.
A similar problem was tackled in module 4, though it was solved using regularized decision trees only. This project tackles it using Logistic Regression, ensembles, and SVMs, and concludes which parameters give the best results.
The data structure is essentially the same as the spamdata.csv used in module 4. To quote UCI's description of where the data came from and why it is structured the way it is:
Our collection of spam e-mails came from our postmaster and individuals who had filed spam. Our collection of non-spam e-mails came from filed work and personal e-mails, and hence the word 'george' and the area code '650' are indicators of non-spam. These are useful when constructing a personalized spam filter. One would either have to blind such non-spam indicators or get a very wide collection of non-spam to generate a general purpose spam filter.
The data contains 4601 data points with 57 features in total. The following description of the features was also adapted from UCI's website:
48 continuous real [0,100] attributes of type word_freq_WORD
= percentage of words in the e-mail that match WORD, i.e. 100 * (number of times the WORD appears in the e-mail) / total number of words in e-mail. A "word" in this case is any string of alphanumeric characters bounded by non-alphanumeric characters or end-of-string.
6 continuous real [0,100] attributes of type char_freq_CHAR
= percentage of characters in the e-mail that match CHAR, i.e. 100 * (number of CHAR occurrences) / total characters in e-mail
1 continuous real [1,...] attribute of type capital_run_length_average
= average length of uninterrupted sequences of capital letters
1 continuous integer [1,...] attribute of type capital_run_length_longest
= length of longest uninterrupted sequence of capital letters
1 continuous integer [1,...] attribute of type capital_run_length_total
= sum of length of uninterrupted sequences of capital letters
= total number of capital letters in the e-mail
1 nominal {0,1} class attribute of type spam
= denotes whether the e-mail was considered spam (1) or not (0), i.e. unsolicited commercial e-mail.
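To make the word_freq_WORD definition concrete, here is a minimal sketch (an illustration added here, not part of the original feature extraction) that computes it for a toy email following the definition above:

import re

def word_freq(email_text, word):
    # A "word" is a run of alphanumeric characters bounded by
    # non-alphanumeric characters or end-of-string.
    words = re.findall(r"[A-Za-z0-9]+", email_text)
    if not words:
        return 0.0
    matches = sum(1 for w in words if w.lower() == word.lower())
    return 100.0 * matches / len(words)

# "free" appears 2 times out of 7 words, so 100 * 2/7 ~ 28.57
word_freq("Make money fast! Free money, free offers.", "free")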
Cleaning¶
The data needs no cleaning step: UCI reports no missing values, and a quick sanity check after loading (see below) confirms this.
Can we do better?¶
One of the objectives will be to obtain results better than those shown on the UCI website. The website gives the following achieved precisions:
[Table of baseline precisions from the UCI page; image not preserved in this export.]
As well as the following accuracy data:
[Table of baseline accuracies from the UCI page; image not preserved in this export.]
# Import all the required libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.colors import Normalize
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import roc_auc_score, roc_curve
%matplotlib inline
Import the data¶
The following cells import the data and split it appropriately.
!pip install ucimlrepo
from ucimlrepo import fetch_ucirepo
# fetch dataset
spambase = fetch_ucirepo(id=94)
# data (as pandas dataframes)
X = spambase.data.features
y = spambase.data.targets
# variable information
print(spambase.variables)
# metadata
spambase.metadata
name role type demographic
0 word_freq_make Feature Continuous None
1 word_freq_address Feature Continuous None
2 word_freq_all Feature Continuous None
3 word_freq_3d Feature Continuous None
4 word_freq_our Feature Continuous None
5 word_freq_over Feature Continuous None
6 word_freq_remove Feature Continuous None
7 word_freq_internet Feature Continuous None
8 word_freq_order Feature Continuous None
9 word_freq_mail Feature Continuous None
10 word_freq_receive Feature Continuous None
11 word_freq_will Feature Continuous None
12 word_freq_people Feature Continuous None
13 word_freq_report Feature Continuous None
14 word_freq_addresses Feature Continuous None
15 word_freq_free Feature Continuous None
16 word_freq_business Feature Continuous None
17 word_freq_email Feature Continuous None
18 word_freq_you Feature Continuous None
19 word_freq_credit Feature Continuous None
20 word_freq_your Feature Continuous None
21 word_freq_font Feature Continuous None
22 word_freq_000 Feature Continuous None
23 word_freq_money Feature Continuous None
24 word_freq_hp Feature Continuous None
25 word_freq_hpl Feature Continuous None
26 word_freq_george Feature Continuous None
27 word_freq_650 Feature Continuous None
28 word_freq_lab Feature Continuous None
29 word_freq_labs Feature Continuous None
30 word_freq_telnet Feature Continuous None
31 word_freq_857 Feature Continuous None
32 word_freq_data Feature Continuous None
33 word_freq_415 Feature Continuous None
34 word_freq_85 Feature Continuous None
35 word_freq_technology Feature Continuous None
36 word_freq_1999 Feature Continuous None
37 word_freq_parts Feature Continuous None
38 word_freq_pm Feature Continuous None
39 word_freq_direct Feature Continuous None
40 word_freq_cs Feature Continuous None
41 word_freq_meeting Feature Continuous None
42 word_freq_original Feature Continuous None
43 word_freq_project Feature Continuous None
44 word_freq_re Feature Continuous None
45 word_freq_edu Feature Continuous None
46 word_freq_table Feature Continuous None
47 word_freq_conference Feature Continuous None
48 char_freq_; Feature Continuous None
49 char_freq_( Feature Continuous None
50 char_freq_[ Feature Continuous None
51 char_freq_! Feature Continuous None
52 char_freq_$ Feature Continuous None
53 char_freq_# Feature Continuous None
54 capital_run_length_average Feature Continuous None
55 capital_run_length_longest Feature Continuous None
56 capital_run_length_total Feature Continuous None
57 Class Target Binary None
(The remaining columns of the variables table, description / units / missing_values, read None / None / no for every row, except row 57, Class, whose description is 'spam (1) or not spam (0)'.)
{'uci_id': 94,
'name': 'Spambase',
'repository_url': 'https://archive.ics.uci.edu/dataset/94/spambase',
'data_url': 'https://archive.ics.uci.edu/static/public/94/data.csv',
'abstract': 'Classifying Email as Spam or Non-Spam',
'area': 'Computer Science',
'tasks': ['Classification'],
'characteristics': ['Multivariate'],
'num_instances': 4601,
'num_features': 57,
'feature_types': ['Integer', 'Real'],
'demographics': [],
'target_col': ['Class'],
'index_col': None,
'has_missing_values': 'no',
'missing_values_symbol': None,
'year_of_dataset_creation': 1999,
'last_updated': 'Mon Aug 28 2023',
'dataset_doi': '10.24432/C53G6X',
'creators': ['Mark Hopkins',
'Erik Reeber',
'George Forman',
'Jaap Suermondt'],
'intro_paper': None,
'additional_info': {'summary': 'The "spam" concept is diverse: advertisements for products/web sites, make money fast schemes, chain letters, pornography...\n\nThe classification task for this dataset is to determine whether a given email is spam or not.\n\t\nOur collection of spam e-mails came from our postmaster and individuals who had filed spam. Our collection of non-spam e-mails came from filed work and personal e-mails, and hence the word \'george\' and the area code \'650\' are indicators of non-spam. These are useful when constructing a personalized spam filter. One would either have to blind such non-spam indicators or get a very wide collection of non-spam to generate a general purpose spam filter.\n\nFor background on spam: Cranor, Lorrie F., LaMacchia, Brian A. Spam!, Communications of the ACM, 41(8):74-83, 1998.\n\nTypical performance is around ~7% misclassification error. False positives (marking good mail as spam) are very undesirable.If we insist on zero false positives in the training/testing set, 20-25% of the spam passed through the filter. See also Hewlett-Packard Internal-only Technical Report. External version forthcoming. ',
'purpose': None,
'funded_by': None,
'instances_represent': 'Emails',
'recommended_data_splits': None,
'sensitive_data': None,
'preprocessing_description': None,
'variable_info': 'The last column of \'spambase.data\' denotes whether the e-mail was considered spam (1) or not (0), i.e. unsolicited commercial e-mail. Most of the attributes indicate whether a particular word or character was frequently occuring in the e-mail. The run-length attributes (55-57) measure the length of sequences of consecutive capital letters. For the statistical measures of each attribute, see the end of this file. Here are the definitions of the attributes:\r\n\r\n48 continuous real [0,100] attributes of type word_freq_WORD \r\n= percentage of words in the e-mail that match WORD, i.e. 100 * (number of times the WORD appears in the e-mail) / total number of words in e-mail. A "word" in this case is any string of alphanumeric characters bounded by non-alphanumeric characters or end-of-string.\r\n\r\n6 continuous real [0,100] attributes of type char_freq_CHAR] \r\n= percentage of characters in the e-mail that match CHAR, i.e. 100 * (number of CHAR occurences) / total characters in e-mail\r\n\r\n1 continuous real [1,...] attribute of type capital_run_length_average \r\n= average length of uninterrupted sequences of capital letters\r\n\r\n1 continuous integer [1,...] attribute of type capital_run_length_longest \r\n= length of longest uninterrupted sequence of capital letters\r\n\r\n1 continuous integer [1,...] attribute of type capital_run_length_total \r\n= sum of length of uninterrupted sequences of capital letters \r\n= total number of capital letters in the e-mail\r\n\r\n1 nominal {0,1} class attribute of type spam\r\n= denotes whether the e-mail was considered spam (1) or not (0), i.e. unsolicited commercial e-mail. \r\n',
'citation': None}}
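Before splitting, a quick sanity check (a small addition, verifying the "already clean" claim from the Cleaning section) confirms that nothing is missing:

# The dataset should contain no missing values in either the
# features or the target, and exactly 4601 rows x 57 features.
print("Missing feature values:", X.isna().sum().sum())
print("Missing target values:", y.isna().sum().sum())
print("Feature matrix shape:", X.shape)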
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=10, test_size=.1)
Helpful functions¶
# Adapted from Module 6, used by plotSearchGrid
class MidpointNormalize(Normalize):
    def __init__(self, vmin=None, vmax=None, midpoint=None, clip=False):
        self.midpoint = midpoint
        Normalize.__init__(self, vmin, vmax, clip)

    def __call__(self, value, clip=None):
        x, y = [self.vmin, self.midpoint, self.vmax], [0, 0.5, 1]
        return np.ma.masked_array(np.interp(value, x, y))
# Adapted from Module 6
def plotSearchGrid(grid, var1, var2):
    scores = np.array(grid.cv_results_["mean_test_score"]).reshape(
        len(grid.param_grid[var1]), len(grid.param_grid[var2]))
    plt.subplots_adjust(left=.2, right=0.95, bottom=0.15, top=0.95)
    plt.imshow(scores, interpolation='nearest', cmap=plt.cm.hot,
               norm=MidpointNormalize(vmin=0.2, midpoint=0.92))
    plt.xlabel(var2)
    plt.ylabel(var1)
    plt.colorbar()
    plt.xticks(np.arange(len(grid.param_grid[var2])), grid.param_grid[var2], rotation=45)
    plt.yticks(np.arange(len(grid.param_grid[var1])), grid.param_grid[var1])
    plt.title('Validation accuracy')
    plt.show()
def calculate_precision(y_true, y_pred, pos_label_value=1.0):
    '''
    This function accepts the labels and the predictions, then
    calculates precision for a binary classifier.

    Args
        y_true: np.ndarray
        y_pred: np.ndarray
        pos_label_value: (float) the number which represents the positive
            label in the y_true and y_pred arrays. Other numbers will be
            taken to be the non-positive class for the binary classifier.

    Returns precision as a floating point number between 0.0 and 1.0
    '''
    TP = np.sum((y_true == pos_label_value) * (y_pred == pos_label_value))
    P_data = np.sum(y_pred == pos_label_value)  # all predicted positives
    return TP / P_data
def calculate_recall(y_true, y_pred, pos_label_value=1.0):
    '''
    This function accepts the labels and the predictions, then
    calculates recall for a binary classifier.

    Args
        y_true: np.ndarray
        y_pred: np.ndarray
        pos_label_value: (float) the number which represents the positive
            label in the y_true and y_pred arrays. Other numbers will be
            taken to be the non-positive class for the binary classifier.

    Returns recall as a floating point number between 0.0 and 1.0
    '''
    TP = np.sum((y_true == pos_label_value) * (y_pred == pos_label_value))
    P_data = np.sum(y_true == pos_label_value)  # all actual positives
    return TP / P_data
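As a sanity check (an optional addition), the two helpers can be verified against sklearn's own precision_score and recall_score on a toy example:

from sklearn.metrics import precision_score, recall_score

# Toy labels: 3 true positives, 1 false positive, 1 false negative.
y_true_demo = np.array([1, 0, 1, 1, 0, 1])
y_pred_demo = np.array([1, 1, 1, 0, 0, 1])
assert calculate_precision(y_true_demo, y_pred_demo) == precision_score(y_true_demo, y_pred_demo)
assert calculate_recall(y_true_demo, y_pred_demo) == recall_score(y_true_demo, y_pred_demo)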
EDA¶
All the features, except the class label, are continuous numbers. As there are far too many features to produce a figure and analysis for each, the EDA will focus on the correlations between the features, starting with a correlation heatmap.
fig, ax = plt.subplots(figsize=(30, 30))
sns.heatmap(X.corr(), vmin=-1, vmax=1, cmap="coolwarm", center=0, annot=True, ax=ax)
[Figure: 57x57 correlation heatmap of the features]
From the correlation heatmap above, we see that there are two "groups" of high correlation: one among 11 particular word frequencies, and one among the capital run lengths. In particular, word_freq_857 and word_freq_415 have a correlation of 1, meaning that at least one of them is practically redundant.
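The perfect correlation can be confirmed directly (a quick check added here for illustration):

# Expected to print ~1.0, matching the heatmap.
print(X["word_freq_857"].corr(X["word_freq_415"]))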
The two groups are further explored using pairplots below.
sns.pairplot(X[["word_freq_hp", "word_freq_hpl", "word_freq_650", "word_freq_lab", "word_freq_labs", "word_freq_telnet", "word_freq_857", "word_freq_415", "word_freq_85", "word_freq_technology", "word_freq_direct"]], diag_kind="kde")
[Figure: pair plot of the 11 highly correlated word-frequency features]
As seen above, the patterns show a very high degree of collinearity. We therefore expect the regularization coefficient to have a big impact on improving the Logistic Regression model.
sns.pairplot(X[["capital_run_length_average", "capital_run_length_longest", "capital_run_length_total"]], diag_kind="kde")
[Figure: pair plot of the three capital-run-length features]
The capital run lengths are also clearly collinear, though this is less obvious from the plots. As we will see from the Logistic Regression coefficients, capital_run_length_total is almost eliminated by the regularization, while word_freq_857's coefficient becomes exactly 0.
Model training¶
Logistic Regression¶
The first model is logistic regression. Using GridSearchCV, different regularization penalties and strengths can be experimented with.
Logistic Regression ordinarily suffers from the curse of dimensionality. However, with regularization, especially l1, this becomes less of a concern.
The coefficients from the best model are printed to provide some insight into which features had the most influence.
LogRegGrid = GridSearchCV(
    LogisticRegression(solver="liblinear"),
    {
        "penalty": ("l1", "l2"),
        "C": np.logspace(-5, 5, base=2, num=11)
    }
).fit(X_train, y_train)
plotSearchGrid(LogRegGrid, "C", "penalty")
print("Best Logistic Regression parameters:", LogRegGrid.best_params_)
print("Best Logistic Regression score:", LogRegGrid.best_score_)
LogRegBest = LogRegGrid.best_estimator_
Best Logistic Regression parameters: {'C': 8.0, 'penalty': 'l1'}
Best Logistic Regression score: 0.9231884057971016
for feature, coef in zip(X.columns.tolist(), LogRegBest.coef_[0]):
    print(feature, ":", coef)
word_freq_make : -0.3352771610625865
word_freq_address : -0.144134104466229
word_freq_all : 0.112018672406649
word_freq_3d : 2.076387357353494
word_freq_our : 0.554593550485832
word_freq_over : 1.2904401963364194
word_freq_remove : 2.5267285725399806
word_freq_internet : 0.5172196645160981
word_freq_order : 0.627473532037221
word_freq_mail : 0.17249868699823487
word_freq_receive : -0.25581126978924495
word_freq_will : -0.14234046357326724
word_freq_people : -0.11854115705721327
word_freq_report : 0.17412149671014215
word_freq_addresses : 1.0500388675253347
word_freq_free : 0.9634466906627731
word_freq_business : 0.9305392973545669
word_freq_email : 0.10133392987631257
word_freq_you : 0.08488243536713497
word_freq_credit : 1.3819517056872157
word_freq_your : 0.2654681374949797
word_freq_font : 0.21749820028292105
word_freq_000 : 2.0810250283128378
word_freq_money : 0.36561247488532317
word_freq_hp : -1.7902775548167147
word_freq_hpl : -1.2336732801585475
word_freq_george : -10.880824805770052
word_freq_650 : 0.44226034330510694
word_freq_lab : -2.2885745842672742
word_freq_labs : -0.2654647593774336
word_freq_telnet : -0.16163430120735922
word_freq_857 : 0.0
word_freq_data : -0.785684361230612
word_freq_415 : 0.45936893654380306
word_freq_85 : -1.9259154363336377
word_freq_technology : 0.8586013549791187
word_freq_1999 : 0.09105557504327932
word_freq_parts : -0.5351601870000458
word_freq_pm : -0.7897860751763727
word_freq_direct : -0.3114823079515479
word_freq_cs : -16.272347427710518
word_freq_meeting : -2.513144116975214
word_freq_original : -1.1180888218223113
word_freq_project : -1.6215101779132928
word_freq_re : -0.8091323320430137
word_freq_edu : -1.307624281598115
word_freq_table : -1.3921522502944936
word_freq_conference : -3.6846542873407873
char_freq_; : -1.233208442411895
char_freq_( : -0.13236694620475534
char_freq_[ : -0.545299178734433
char_freq_! : 0.3144554745969651
char_freq_$ : 4.90464860192831
char_freq_# : 1.85535122747682
capital_run_length_average : 0.014264456456158873
capital_run_length_longest : 0.007832612484971016
capital_run_length_total : 0.0008400454072136723
Feature ranking¶
It seems from the data above that the $ sign is the strongest spam indicator, while cs is the strongest anti-indicator.
The data used was not normalized, though, so one should exercise caution when ranking features solely by their coefficients.
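With that caveat in mind, here is a small sketch (added for illustration) that ranks the features by absolute coefficient magnitude:

# Rank features by |coefficient|; with unscaled inputs this
# ranking is only indicative, as noted above.
ranked = sorted(zip(X.columns, LogRegBest.coef_[0]), key=lambda fc: abs(fc[1]), reverse=True)
for feature, coef in ranked[:5]:
    print(f"{feature}: {coef:.3f}")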
Decision Tree Ensembles¶
Second, decision-tree ensembles are used: both Random Forest and AdaBoost.
For both, the best n_estimators is searched for. In addition, Random Forest experiments with max_depth, while AdaBoost experiments with learning_rate.
In module 4, using decision trees, we could not get an accuracy above 0.95 on the test data. Can we do better?
RFGrid = GridSearchCV(
    RandomForestClassifier(max_samples=.7),
    {
        "max_depth": range(10, 20, 3),
        "n_estimators": range(100, 1100, 300)
    }
).fit(X_train, y_train)
plotSearchGrid(RFGrid, "max_depth", "n_estimators")
print("Best Random Forest parameters:", RFGrid.best_params_)
print("Best Random Forest score:", RFGrid.best_score_)
RFBest = RFGrid.best_estimator_
Best Random Forest parameters: {'max_depth': 19, 'n_estimators': 700}
Best Random Forest score: 0.9487922705314009
ABGrid = GridSearchCV(
    AdaBoostClassifier(),
    {
        "learning_rate": np.logspace(-4, -1, base=2, num=4),
        "n_estimators": range(100, 1100, 300)
    }
).fit(X_train, y_train)
plotSearchGrid(ABGrid, "learning_rate", "n_estimators")
print("Best AdaBoost parameters:", ABGrid.best_params_)
print("Best AdaBoost score:", ABGrid.best_score_)
ABBest = ABGrid.best_estimator_
Best AdaBoost parameters: {'learning_rate': 0.0625, 'n_estimators': 1000}
Best AdaBoost score: 0.9475845410628019
fig, ax = plt.subplots()
ax.set_xlabel("Iteration")
ax.set_ylabel("Error")
ax.set_title("Misclassification error for training and testing sets when using AdaBoost")
ax.plot(1 - np.fromiter(ABBest.staged_score(X_train, y_train), float), label="train")
ax.plot(1 - np.fromiter(ABBest.staged_score(X_test, y_test), float), label="test")
ax.legend()
plt.show()
SVMs¶
Finally, SVMs are used, first with the rbf kernel, as it seems to strike a good balance between flexibility and complexity, though it requires tuning. GridSearchCV is employed to find the best values for C and gamma.
Then we check whether the simpler LinearSVC actually performs better on this dataset.
SVMs could be a great model for this dataset, as they are less vulnerable to the curse of dimensionality, but will the assumption of the existence of a separating hyperplane hold well in this situation?
SVCGrid = GridSearchCV(
    SVC(),
    {
        "gamma": np.logspace(-7, -3, base=2, num=5),
        "C": np.logspace(1, 5, base=2, num=5)
    }
).fit(X_train, y_train)
plotSearchGrid(SVCGrid, "C", "gamma")
print("Best SVM parameters:", SVCGrid.best_params_)
print("Best SVM score:", SVCGrid.best_score_)
SVCBest = SVCGrid.best_estimator_
Best SVM parameters: {'C': 8.0, 'gamma': 0.0078125}
Best SVM score: 0.8657004830917874
SVCGrid2 = GridSearchCV(
    LinearSVC(),
    {
        "loss": ("hinge", "squared_hinge"),
        "C": np.logspace(-15, -5, base=2, num=11)
    }
).fit(X_train, y_train)
plotSearchGrid(SVCGrid2, "C", "loss")
print("Best LinearSVC parameters:", SVCGrid2.best_params_)
print("Best LinearSVC score:", SVCGrid2.best_score_)
LinearSVCBest = SVCGrid2.best_estimator_
Best LinearSVC parameters: {'C': 0.015625, 'loss': 'hinge'}
Best LinearSVC score: 0.9031400966183576
Results¶
We start by checking the validation scores (using the initial train-test split, not CV) of each of the best models:
print("Logistic Regression validation score:", LogRegBest.score(X_test, y_test))
print("Random Forest validation score:", RFBest.score(X_test, y_test))
print("AdaBoost validation score:", ABBest.score(X_test, y_test))
print("SVM with rbf validation score:", SVCBest.score(X_test, y_test))
print("Linear SVC validation score:", LinearSVCBest.score(X_test, y_test))
Logistic Regression validation score: 0.9501084598698482
Random Forest validation score: 0.9631236442516269
AdaBoost validation score: 0.9674620390455532
SVM with rbf validation score: 0.8937093275488069
Linear SVC validation score: 0.9175704989154013
Comparing these values, all models outperform the baseline model scores published on UCI's website, and by a good margin.
But do we get better precision values as well? Let's check:
# These variables store the true and predicted values so we don't need to recalculate them again and again.
y_test_flat = np.ravel(y_test)
LogRegPredict = LogRegBest.predict(X_test)
RFPredict = RFBest.predict(X_test)
ABPredict = ABBest.predict(X_test)
SVCPredict = SVCBest.predict(X_test)
LinearSVCPredict = LinearSVCBest.predict(X_test)
print("Logistic Regression precision score:", calculate_precision(y_test_flat, LogRegPredict))
print("Random Forest precision score:", calculate_precision(y_test_flat, RFPredict))
print("AdaBoost precision score:", calculate_precision(y_test_flat, ABPredict))
print("SVM with rbf precision score:", calculate_precision(y_test_flat, SVCPredict))
print("Linear SVC precision score:", calculate_precision(y_test_flat, LinearSVCPredict))
Logistic Regression precision score: 0.9481865284974094
Random Forest precision score: 0.9735449735449735
AdaBoost precision score: 0.9738219895287958
SVM with rbf precision score: 0.8808290155440415
Linear SVC precision score: 0.9247311827956989
The precision values also outperform the baseline model scores.
While recall values are not given by the UCI website, let us calculate them nonetheless:
print("Logistic Regression recall score:", calculate_recall(y_test_flat, LogRegPredict))
print("Random Forest recall score:", calculate_recall(y_test_flat, RFPredict))
print("AdaBoost recall score:", calculate_recall(y_test_flat, ABPredict))
print("SVM with rbf recall score:", calculate_recall(y_test_flat, SVCPredict))
print("Linear SVC recall score:", calculate_recall(y_test_flat, LinearSVCPredict))
Logistic Regression recall score: 0.9336734693877551
Random Forest recall score: 0.9387755102040817
AdaBoost recall score: 0.9489795918367347
SVM with rbf recall score: 0.8673469387755102
Linear SVC recall score: 0.8775510204081632
Finally, let us check the ROC curves:
for label, pred in [("Logistic Regression", LogRegPredict), ("Random Forest", RFPredict),
                    ("AdaBoost", ABPredict), ("SVM", SVCPredict), ("Linear SVC", LinearSVCPredict)]:
    # Hard 0/1 predictions give a single operating point, so each ROC
    # "curve" has just three points; scores (e.g. from predict_proba)
    # would trace a fuller curve.
    fpr, tpr, _ = roc_curve(y_test_flat, pred)
    roc_auc = roc_auc_score(y_test_flat, pred)
    plt.plot(fpr, tpr)
    plt.title(f"ROC by {label}, AUC={roc_auc:.3f}")
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.show()
Analysis, Discussion and Conclusion¶
Comparing the above results, we can safely say that the ensemble models (Random Forest and AdaBoost) outdid the other models by far. Logistic Regression did not do badly either.
SVMs, though, performed quite poorly. It seems that the assumptions inherent to SVMs do not hold well for the dataset at hand. This might have been expected, given that the SVC baseline model from UCI did poorly as well. The fact that LinearSVC performed better than the rbf kernel was more surprising; it suggests the underlying decision boundary is more linear than radial, as evidenced by the rbf model leaning strongly towards smaller gamma values.
Logistic Regression¶
It seems that the model preferred l1 regularization. This was expected, as this dataset contains high collinearity, requiring the effective elimination of some features. Looking at the heatmap, though, it seems that as the regularization gets stronger, l2 becomes more suitable.
One thing that might improve the results is normalizing the data before training; that should help the regularization treat the features more fairly.
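A minimal sketch of that idea (an untested addition, assuming the imports above), wrapping a scaler and the model in a pipeline so the grid search itself is unchanged:

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Standardize features inside the pipeline so the l1/l2 penalties
# act on all features at a comparable scale.
ScaledLogRegGrid = GridSearchCV(
    make_pipeline(StandardScaler(), LogisticRegression(solver="liblinear")),
    {
        "logisticregression__penalty": ("l1", "l2"),
        "logisticregression__C": np.logspace(-5, 5, base=2, num=11)
    }
).fit(X_train, y_train)
print(ScaledLogRegGrid.best_params_, ScaledLogRegGrid.best_score_)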
Ensembles¶
Random Forest seemed to prefer deeper trees on this dataset, while AdaBoost performed better the lower the learning rate was. From the misclassification-error graph obtained from the staged scores, we can clearly see that AdaBoost runs the risk of slightly overfitting as the number of estimators increases.
Further experiments could attempt to improve on the results by trying deeper trees for Random Forest and even lower learning rates for AdaBoost.
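One possible follow-up grid (a hypothetical sketch, not run here) extends the search in the directions the grids above favored:

# Deeper trees for Random Forest, lower learning rates for AdaBoost,
# keeping the previously best n_estimators fixed.
RFGridWide = GridSearchCV(
    RandomForestClassifier(max_samples=.7),
    {"max_depth": range(19, 40, 5), "n_estimators": [700]}
).fit(X_train, y_train)
ABGridWide = GridSearchCV(
    AdaBoostClassifier(),
    {"learning_rate": np.logspace(-7, -4, base=2, num=4), "n_estimators": [1000]}
).fit(X_train, y_train)
print(RFGridWide.best_params_, ABGridWide.best_params_)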
SVMs¶
As mentioned earlier, the rbf kernel performed poorly on the dataset, clearly leaning towards vanishingly small gamma values. This might suggest that the non-linearity introduced by the kernel is causing overfitting rather than helping.
LinearSVC, by comparison, prefers much less regularization, again suggesting that the model is very sensitive to high variance. For the selected value of C, hinge vs squared_hinge did not seem to make much of a difference, though squared_hinge seems to cope better with more extreme values of C.
Nevertheless, there are steps that could be taken to improve the results, such as normalizing the data beforehand, as sketched in the Logistic Regression discussion above.
Precision vs Recall¶
In this particular case, false positives are especially bad: we never want a legitimate email to be falsely classified as spam. False negatives, on the other hand, are less problematic. As such, precision is much more important than mere accuracy. Thankfully, comparing the precisions to the recalls, we see that precision is generally higher; the ensembles especially are exceptional in terms of precision.
This result can also be seen in the ROC/AUC graphs, where both ensembles outperform the rest of the models.
That being said, this is still not satisfactory. Steps should be taken to increase precision even if that means lower accuracy overall. Possible methods include raising the acceptance threshold for Logistic Regression or adjusting the corresponding class weights for the other methods.
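As a sketch of the threshold idea (the 0.8 threshold below is an arbitrary illustration, not a tuned value):

# Raise the decision threshold above the default 0.5 so the model
# only flags spam when it is quite confident, trading recall for precision.
threshold = 0.8
spam_proba = LogRegBest.predict_proba(X_test)[:, 1]
y_pred_strict = (spam_proba >= threshold).astype(float)
print("Precision:", calculate_precision(y_test_flat, y_pred_strict))
print("Recall:", calculate_recall(y_test_flat, y_pred_strict))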
Takeaway¶
Overall, it seems that AdaBoost and Random Forest are the best algorithms for this task, as they top both the precision and accuracy metrics. Improvements can still be made, though, both to increase performance by tweaking the hyperparameters and to ensure that precision is given top priority.