-
Notifications
You must be signed in to change notification settings - Fork 2
Expand file tree
/
Copy pathclassify.py
More file actions
164 lines (132 loc) · 6.29 KB
/
classify.py
File metadata and controls
164 lines (132 loc) · 6.29 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
"""Classification of feature-vectors using KNN classifier.
The :class:`KNN` class contains the classifier. It can :meth:`~KNN.classify` new datapoints
as soon as it is properly trained using the :meth:`~KNN.train` method.
The :meth:`~KNN.test` method provides a way to classify many vectors at once, and
return the classifiers accuracy compared to a gold standard.
:Author: Kjetil Valle <kjetilva@stud.ntnu.no>"""
import nltk
import numpy
import scipy.spatial.distance
from nltk.probability import FreqDist
class KNN:
"""K-nearest neighbors classifier.
Classifier for labeled data in feature-vector format.
Supports k-nearest classification against trained data samples, and
1-nearest classification against class centroids.
"""
def __init__(self, use_centroids=False, k=5):
"""Creates a new untrained classifier.
If *use_centroids*, the classifier will calculate and use the best
matching centroid of each the samples from each class when
classifying. Otherwise the class among the k nearest is used.
*k* is the number of neighbors to use when classifying. Ignored if
centroids are used for classification.
"""
self._use_centroids = use_centroids
self._k = k
self.features = None # features trained against
self.labels = None # corresponding labels
self.classes = None # the set of unique labels used
self.active_features = None # reduced feature-vectors to use for classification
self.centroids = None # centroid vector for each class using all features
self.active_centroids = None # centroid vectors using active features
def train(self, features, labels):
"""Trains the KNN on a set of data.
Uses NxM feature matrix *features* with M samples, each of N features.
See output from :func:`data.read_files`.
The list of *labels* correspond to each of the M samples.
"""
self.labels = labels
self.features = features
self.classes = list(set(labels))
# features initially active
self.active_features = features
if self._use_centroids:
self.centroids = self._calculate_centroids()
self.active_centroids = self.centroids
def _calculate_centroids(self):
"""Calcualtes centroid vectors for each class in the list of labels.
The centroids are calculated as the geometric mean of all document
feature vectors belonging to that class.
"""
fs = None
for c in self.classes:
indices = numpy.array([self.labels[i]==c for i in range(len(self.labels))])
cluster = self.features[:,indices]
centroid = numpy.average(cluster, axis=1)
centroid = numpy.reshape(centroid,(len(centroid),1))
if fs is None:
fs = centroid
else:
fs = numpy.hstack((fs, centroid))
return fs
def classify(self, qs, distance_metric='cosine'):
"""Classifies a list of query cases.
When classifying only those features that are **active** are
used, all other features are ignored. The set of active features
can be changed by :func:`~KNN.set_active_features`.
Feature matrix *qs* is similar to that used in :func:`~KNN.train`, i.e a NxM
matrix where N is number of features and M documents.
The string *distance_metric* defines what metric to use when comparing feture vectors.
See http://docs.scipy.org/doc/scipy/reference/spatial.distance.html#scipy.spatial.distance.cdist
for list of supported metrics.
Returns classification of each of the input cases.
"""
if len(qs[:,0]) != len(self.active_features[:,0]):
got = len(qs[:,0])
expected = len(self.active_features[:,0])
raise ValueError, "Wrong number of features in queries. got {0}, expected {1}".format(got, expected)
# calculate distance matrix
if self._use_centroids:
dm = scipy.spatial.distance.cdist(self.active_centroids.T, qs.T, distance_metric)
ans = self.classes
else:
dm = scipy.spatial.distance.cdist(self.active_features.T, qs.T, distance_metric)
ans = self.labels
# classify queries
results = []
for q in range(len(qs[0])):
if q%100==0: print ' query',str(q)+'/'+str(len(qs[0]))
if self._k==1:
# No need to vote; dictator decides..
i = numpy.argsort(dm[:,q])[0]
results.append(ans[i])
else:
indices = numpy.argsort(dm[:,q])
indices = indices[:self._k] # only the k closest
# Marjority vote amongst the k
votes = []
for i in indices:
votes.append(ans[i])
winner = max(set(votes), key=votes.count)
results.append(winner)
return results
def set_active_features(self, list=None):
"""Changes the set of active feature.
Takes a *list* of features to make active.
Could either be a list of feature indices, or boolean list
with length equal to number of features where `true` == active.
If `None`, all features are activated.
"""
if list is None:
# Use all features
self.active_features = self.features
if self._use_centroids:
self.active_centroids = self.centroids
else:
# Use specified features
self.active_features = self.features[list,:]
if self._use_centroids:
self.active_centroids = self.centroids[list,:]
def test(self, features, gold):
"""Tests this classifier against a set of labeled data.
It is assumed that the classifier has been trained before
this method is called.
*features* is a NxM (features x documents) feature matrix, and
*gold* a list of labels belonging to each of the documents in the
feature matrix.
Returns the accuracy of the classifier over the training data.
"""
results = self.classify(features)
accuracy = nltk.metrics.scores.accuracy(gold, results)
return accuracy