-
Notifications
You must be signed in to change notification settings - Fork 0
/
spam_classifier_final.py
125 lines (91 loc) · 3.62 KB
/
spam_classifier_final.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
"""
_ _
| | | |
__ _ _ __ _ _ __ _ _ __ ___| |__ | |__
/ _` | '__| | | |/ _` | '_ \/ __| '_ \| '_ \
| (_| | | | |_| | (_| | | | \__ \ | | | |_) |
\__,_|_| \__, |\__,_|_| |_|___/_| |_|_.__/
__/ |
|___/
"""
import ast

import numpy as np
import pandas as pd
# This section builds a vocabulary (word -> column index) from the emails.
vocabulary = {}
data = pd.read_csv("data/emails.csv")
# words.txt contains all the words of the English dictionary (scraped from internet)
# link for words.txt: https://github.com/aryanshb/english-words/blob/master/words.txt
# Fix: use a context manager so the file handle is closed (the original bare
# open() leaked it), and build the set with a comprehension.
with open("words.txt") as words_file:
    set_words = {line.strip() for line in words_file}
def build_vocabulary(curr_email, vocab=None, valid_words=None):
    """Add every new dictionary word of *curr_email* to the vocabulary.

    Words are stored lowercased, so membership tests and later lookups are
    case-insensitive.  (The original checked ``word.lower()`` but inserted the
    original-cased ``word``, which created mismatched keys and broke the
    case-insensitive lookups in the feature-building pass.)

    Args:
        curr_email: iterable of word tokens from a single email.
        vocab: dict mapping word -> column index to extend in place;
            defaults to the module-level ``vocabulary``.
        valid_words: set of accepted English words; defaults to the
            module-level ``set_words``.
    """
    if vocab is None:
        vocab = vocabulary
    if valid_words is None:
        valid_words = set_words
    for word in curr_email:
        key = word.lower()
        if key not in vocab and key in valid_words:
            # len(vocab) is always the next unused column index.
            vocab[key] = len(vocab)
if __name__ == "__main__":
    # Pass 1: scan every email once, accumulating the vocabulary.
    for i in range(data.shape[0]):
        curr_email = data.iloc[i, :][0].split()
        print(
            f"Current email is {i}/{data.shape[0]} and the \
length of vocab is curr {len(vocabulary)}"
        )
        build_vocabulary(curr_email)
    # Persist the vocabulary so the feature-building pass can reload it.
    # Fix: context manager guarantees the file is flushed and closed even on
    # error (the original open()/close() pair leaked on exceptions).
    with open("vocabulary.txt", "w") as vocab_file:
        vocab_file.write(str(vocabulary))
############################################################################
# Pass 2: map each email to a bag-of-words count vector.
with open("vocabulary.txt", "r") as vocab_file:
    contents = vocab_file.read()
# Fix: ast.literal_eval parses the repr of the dict safely; eval() would
# execute arbitrary code if vocabulary.txt were ever tampered with.
vocabulary = ast.literal_eval(contents)
X = np.zeros((data.shape[0], len(vocabulary)))
y = np.zeros((data.shape[0]))
for i in range(data.shape[0]):
    email = data.iloc[i, :][0].split()
    for email_word in email:
        token = email_word.lower()
        # Fix: index with the same lowered token that was tested; the
        # original indexed vocabulary[email_word], which raised KeyError
        # for any capitalized word whose lowercase form was in the vocab.
        if token in vocabulary:
            X[i, vocabulary[token]] += 1
    # Second CSV column is assumed to be the spam/ham label — TODO confirm.
    y[i] = data.iloc[i, :][1]
# Save stored numpy arrays
np.save("data/X.npy", X)
np.save("data/y.npy", y)
############################################################################
# this is the implementation of naive bayes algorithm
class NaiveBayes:
    """Gaussian Naive Bayes classifier over a dense feature matrix."""

    def __init__(self, X, y):
        """Record dataset dimensions and keep the training labels.

        Args:
            X: (num_examples, num_features) feature matrix.
            y: (num_examples,) integer class labels 0..C-1.
        """
        self.num_examples, self.num_features = X.shape
        self.num_classes = len(np.unique(y))
        # Fix: store the labels; the original fit() silently read a
        # module-level global `y` instead of the labels given here.
        self.y = y
        # Variance smoothing: avoids log(0) and division by zero for
        # zero-variance features.
        self.eps = 1e-6

    def fit(self, X, y=None):
        """Estimate per-class mean, variance and prior from the data.

        Args:
            X: training feature matrix.
            y: optional labels aligned with X's rows; defaults to the labels
               passed to __init__ (backward compatible with fit(X)).
        """
        if y is None:
            y = self.y
        self.classes_mean = {}
        self.classes_variance = {}
        self.classes_prior = {}
        for c in range(self.num_classes):
            X_c = X[y == c]
            self.classes_mean[str(c)] = np.mean(X_c, axis=0)
            self.classes_variance[str(c)] = np.var(X_c, axis=0)
            self.classes_prior[str(c)] = X_c.shape[0] / X.shape[0]

    def predict(self, X):
        """Return the most probable class index for each row of X."""
        # Fix: size the score matrix by X's own row count; the original used
        # self.num_examples (the training-set size), which broke prediction
        # on any matrix with a different number of rows.
        probs = np.zeros((X.shape[0], self.num_classes))
        for c in range(self.num_classes):
            prior = self.classes_prior[str(c)]
            probs_c = self.density_function(
                X, self.classes_mean[str(c)], self.classes_variance[str(c)]
            )
            probs[:, c] = probs_c + np.log(prior)
        return np.argmax(probs, 1)

    def density_function(self, x, mean, sigma):
        """Per-row log of the Gaussian density N(mean, sigma) of x."""
        # log-normalization constant + negative quadratic term.
        const = -self.num_features / 2 * np.log(2 * np.pi) - 0.5 * np.sum(
            np.log(sigma + self.eps)
        )
        probs = 0.5 * np.sum(np.power(x - mean, 2) / (sigma + self.eps), 1)
        return const - probs
# driver
if __name__ == "__main__":
    # Requires the .npy files produced by the vocabulary/feature passes above.
    X, y = np.load("data/X.npy"), np.load("data/y.npy")
    model = NaiveBayes(X, y)
    model.fit(X)
    predictions = model.predict(X)
    # Training-set accuracy: fraction of rows predicted correctly.
    accuracy = sum(predictions == y) / X.shape[0]
    print(f"Accuracy: {accuracy}")