classifiers.py
from collections import Counter, defaultdict
import math
import pprint
from decorators import memoize # Advanced material
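
# The methods below only assume that each data point exposes a `klass` label
# and a `featuredict` mapping feature names to integer counts. This namedtuple
# is a minimal sketch of that shape (a hypothetical stand-in inferred from the
# attribute access below; the project presumably defines its own):
from collections import namedtuple

DataPoint = namedtuple("DataPoint", ["klass", "featuredict"])
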
class NaiveBayesClassifier(object):

    def __init__(self, laplace_smoothing_constant=0.01):
        self.total_counter = 0
        self.class_counter = Counter()
        self.feature_given_class_counter = defaultdict(Counter)

        # Hyperparameter that can be tuned via Cross Validation to improve performance
        self.laplace_smoothing_constant = laplace_smoothing_constant

    def _update_with_one_data_point(self, data_point):
        # Increment the total counter
        self.total_counter += 1

        # Increment class_counter
        self.class_counter[data_point.klass] += 1

        # Increment feature_given_class counter for each feature in featuredict
        for feature_name, feature_value in data_point.featuredict.items():
            assert isinstance(feature_value, int), "only int typed feature values currently supported"
            # Bonus (advanced): can one extend Naive Bayes to real-valued features? (hint: yes ;)
            self.feature_given_class_counter[data_point.klass][feature_name] += feature_value
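
    # One answer to the bonus question above (a hedged sketch, not part of the
    # original file): real-valued features can be handled by modeling
    # p(x | klass) with a per-(feature, klass) Gaussian fit during training:
    #
    #   def _gaussian_log_likelihood(x, mu, sigma):
    #       return -math.log(sigma * math.sqrt(2 * math.pi)) - ((x - mu) ** 2) / (2 * sigma ** 2)
    #
    # where mu and sigma are the mean and standard deviation of the feature's
    # values among training points of that class (Gaussian Naive Bayes).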

    def train(self, train_set, verbose=False):
        for data_point in train_set:
            self._update_with_one_data_point(data_point)

        if verbose:
            print("Training complete. Counters:")
            pprint.pprint(self.total_counter)
            pprint.pprint(self.class_counter)
            pprint.pprint(self.feature_given_class_counter)

    @memoize  # Advanced material, see note on memoize
    def _prior(self, klass):
        # Laplace smoothing
        numerator = self.laplace_smoothing_constant
        denominator = len(self.class_counter) * self.laplace_smoothing_constant

        # On top of the unsmoothed counts
        numerator += self.class_counter[klass]
        denominator += self.total_counter

        # Gives us our smoothed prior
        return float(numerator) / denominator
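
    # A worked sketch of the smoothed prior above (added note, not from the
    # original file): with smoothing constant alpha, K distinct classes, and
    # N training points,
    #   P(klass) = (count(klass) + alpha) / (N + K * alpha)
    # e.g. alpha=0.01, K=2, count=3, N=10 gives 3.01 / 10.02 ~= 0.3004, and a
    # class with zero counts still gets a small non-zero prior.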

    @memoize  # Advanced material, see note on memoize
    def _vocabulary_size(self):
        vocab = set()
        for klass in self.class_counter:  # for each class
            # get all the features in class and add them to total cross-class vocabulary
            vocab.update(self.feature_given_class_counter[klass])
        return len(vocab)

    @memoize  # Advanced material, see note on memoize
    def _likelihood(self, feature_name, klass):
        # Laplace smoothing
        numerator = self.laplace_smoothing_constant
        denominator = self._vocabulary_size() * self.laplace_smoothing_constant

        # On top of the unsmoothed counts
        numerator += self.feature_given_class_counter[klass].get(feature_name, 0)
        denominator += sum(self.feature_given_class_counter[klass].values())

        # Gives us our smoothed likelihood
        return float(numerator) / denominator
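
    # A worked sketch of the smoothed likelihood above (added note, not from
    # the original file): with smoothing constant alpha and vocabulary size V,
    #   P(feature | klass) = (count(feature, klass) + alpha)
    #                        / (total_feature_count(klass) + V * alpha)
    # so a feature never seen with this class contributes a small non-zero
    # probability instead of zeroing out the whole product in predict().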

    def predict(self, data_point, verbose=False):
        # Where we'll store probabilities by class
        pseudo_probability_by_class = {}

        # Calculate the pseudo probability for each class
        for klass in self.class_counter:
            prior = self._prior(klass)

            # Aggregate likelihood
            likelihoods = []
            for feature_name in data_point.featuredict:  # for each feature
                # for each time the feature appeared
                for _ in range(data_point.featuredict[feature_name]):
                    likelihoods.append(self._likelihood(feature_name, klass))

            # Add prior and likelihoods in logspace to avoid floating point underflow.
            # The class with the highest log probability is still the most probable.
            numerator_terms = [prior] + likelihoods
            pseudo_probability_by_class[klass] = sum(math.log(t) for t in numerator_terms)

        # Pick the class with the maximum probability and return it as our prediction
        sorted_probability_by_class = sorted(pseudo_probability_by_class.items(),
                                             # Sorts ascending by default, we want biggest probability first => descending
                                             key=lambda x: x[1], reverse=True)
        prediction = sorted_probability_by_class[0][0]

        if verbose:
            print("Predicting: {}".format(prediction))
        return prediction
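

# Why logspace works in predict() (added note, not from the original file):
# argmax_c P(c) * prod_f P(f | c) equals argmax_c [log P(c) + sum_f log P(f | c)]
# because log is monotonically increasing; summing logs avoids the underflow
# that multiplying many probabilities < 1 would cause in floating point.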


def evaluate_classifier(classifier, class_of_interest,
                        evaluation_data, verbose=False, progress=True):
    if verbose:
        print("Evaluating performance for class {}".format(class_of_interest))

    tp, fp, tn, fn = 0, 0, 0, 0  # true positive, false positive, true negative, false negative
    count = 0
    for dp in evaluation_data:
        count += 1
        if progress and count % 1000 == 0:
            print("progress: {} / {}".format(count, len(evaluation_data)))

        prediction = classifier.predict(dp)
        actual = dp.klass
        if actual == prediction:  # we got it right!
            if prediction == class_of_interest:
                tp += 1
            else:
                tn += 1
        else:  # we got it wrong :(
            if prediction == class_of_interest:
                fp += 1
            else:
                fn += 1

    precision = float(tp) / (tp + fp)
    recall = float(tp) / (tp + fn)
    f1 = 2 * precision * recall / (precision + recall)

    if verbose:
        print("precision:", precision)
        print("recall:", recall)
        print("f1:", f1)
    return f1, precision, recall
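

# A minimal usage sketch (added for illustration, not part of the original
# file); the toy spam/ham data and the DataPoint shape defined near the top
# are assumptions, not the project's real dataset.
if __name__ == "__main__":
    train_set = [
        DataPoint(klass="spam", featuredict={"buy": 2, "now": 1}),
        DataPoint(klass="spam", featuredict={"buy": 1, "viagra": 1}),
        DataPoint(klass="ham", featuredict={"meeting": 1, "tomorrow": 1}),
        DataPoint(klass="ham", featuredict={"buy": 1, "groceries": 1}),
    ]

    classifier = NaiveBayesClassifier()
    classifier.train(train_set, verbose=True)

    # "buy now" shares more (and more frequent) features with the spam
    # training points, so spam should win.
    test_point = DataPoint(klass="spam", featuredict={"buy": 1, "now": 1})
    classifier.predict(test_point, verbose=True)

    # Note: evaluate_classifier divides by tp + fp (and tp + fn), so it raises
    # ZeroDivisionError when class_of_interest is never (or never correctly)
    # predicted; use a reasonably sized evaluation set in practice.
    evaluate_classifier(classifier, "spam", [test_point], verbose=True)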