# Natural Language Toolkit: SVM-based classifier
#
# Copyright (C) 2001-2012 NLTK Project
# Author: Leon Derczynski <leon@dcs.shef.ac.uk>
#
# URL: <http://www.nltk.org/>
# For license information, see LICENSE.TXT
"""
A classifier based on a support vector machine. This code uses Thorsten Joachims'
SVM^light implementation (http://svmlight.joachims.org/), wrapped using
PySVMLight (https://bitbucket.org/wcauchois/pysvmlight). The default settings are to
train a linear classification kernel, though through minor modification, full SVMlight
capabilities should be accessible if needed. Only binary classification is possible at present.
"""
from __future__ import print_function
from nltk.probability import DictionaryProbDist
from nltk.classify.api import ClassifierI
#
# Interface to Support Vector Machine
#
try:
import svmlight
except:
raise LookupError("\n\n===========================================================================\n NLTK was unable to import SVMlight!\n\n For more information, see <https://bitbucket.org/wcauchois/pysvmlight>\n===========================================================================")
# create a boolean feature name for the SVM from a feature/value pair,
# that'll take on a 1.0 value if the original feature:value is asserted.
[docs]def featurename(feature, value):
"""
:param feature: a string denoting a feature name
:param value: the value of the feature
"""
return '|'.join([feature, type(value).__name__, str(value)])
# convert a set of NLTK classifier features to SVMlight format
[docs]def map_features_to_svm(features, svmfeatureindex):
"""
:param features: a dict of features in the format {'feature':value}
:param svmfeatureindex: a mapping from feature:value pairs to integer SVMlight feature labels
"""
instancefeatures = []
# svmlight supports sparse feature sets and so we simply omit features that we don't include
for k,v in features.iteritems():
# each feature is represented as an (int, float) tuple where the int is the SVMlight feature label and the float is the value; as we either have or have not a feature, this is 1.0
# this does not support scalar features - rather, each value that a feature may take on is a discrete independent label
# use 1.0 as the feature value to specify the presence of a feature:value couple
svmfeaturename = featurename(k, v)
if svmfeaturename not in svmfeatureindex:
# skip over feature:value pairs that were not in the training data and so not included in our mappings
continue
instancefeatures.append( (svmfeatureindex[svmfeaturename], 1.0) )
return instancefeatures
# convert a whole instance (including label) from NLTK to SVMlight format
[docs]def map_instance_to_svm(instance, labelmapping, svmfeatureindex):
"""
:param instance: an NLTK format instance, which is in the tuple format (dict(), label), where the dict contains feature:value pairs, and the label signifies the target attribute's value for this instance (e.g. its class)
:param labelmapping: a previously-defined dict mapping from text labels in the NLTK instance format to SVMlight labels of either +1 or -1
@svmfeatureindex: a mapping from feature:value pairs to integer SVMlight feature labels
"""
(features, label) = instance
instancefeatures = map_features_to_svm(features, svmfeatureindex)
return (labelmapping[label], instancefeatures)
[docs]class SvmClassifier(ClassifierI):
"""
A Support Vector Machine classifier. To explain briefly, support
vector machines (SVM) treat each feature as a dimension, and
position features in n-dimensional feature space. An optimal
hyperplane is then determined that best divides feature space into
classes, and future instances classified based on which side of
the hyperplane they lie on, and their proximity to it.
This implementation is for a binary SVM - that is, only two
classes are supported. You may achieve perform classification with
more classes by training an SVM per class and then picking a best
option for new instances given results from each binary class-SVM.
"""
def __init__(self, labels, labelmapping, svmfeatures, model=None):
"""
:param labels: a list of text labels for classes
:param labelmapping: a mapping from labels to SVM classes (-1,+1)
:param svmfeatures: a list of svm features, where the index is the integer feature number and the value an feature/value pair
:param model: the model generated by svmlight.learn()
"""
self._labels = labels
self._model = model
self._labelmapping = labelmapping
self._svmfeatures = svmfeatures
# _svmfeatureindex is the inverse of svmfeatures, allowing us
# to find an SVM feature name (int) given a feature/value
self._svmfeatureindex = dict(zip(svmfeatures, range(len(svmfeatures))))
self._verbose = False
[docs] def labels(self):
"""
Return the list of class labels.
"""
return self._labels
[docs] def svm_label_name(self, label):
"""
searches values of _labelmapping to resolve +1 or -1 to a string
:param label: the string label to look up
"""
labelname = [k for k, v in self._labelmapping.iteritems() if v == label][0]
return labelname
[docs] def resolve_prediction(self, prediction):
"""
resolve a float (in this case, probably from
svmlight.learn().classify()) to either -1 or +1, and then look
up the label for that class in _labelmapping, and return the
text label
:param prediction: a signed float describing classifier confidence
"""
classification = cmp(prediction, 0)
return self.svm_label_name(classification)
def _get_svm_classification(self, featureset):
"""
given a set of features, classify them with our trained model
and return a signed float
:param featureset: a dict of feature/value pairs in NLTK format, representing a single instance
"""
instance_to_classify = (0, map_features_to_svm(featureset, self._svmfeatureindex))
if self._verbose:
print('instance', instance_to_classify)
# svmlight.classify expects a list; this should be taken advantage of when writing SvmClassifier.batch_classify / .batch_prob_classify.
# it returns a list of floats, too.
[prediction] = svmlight.classify(self._model, [instance_to_classify])
return prediction
[docs] def prob_classify(self, featureset):
"""
Return a probability distribution of classifications
:param featureset: a dict of feature/value pairs in NLTK format, representing a single instance
"""
if self._model is None:
raise Exception('This classifier is not yet trained')
return None
# do the classification
prediction = self._get_svm_classification(featureset)
if self._verbose:
print('prediction', prediction)
# lump it into a boolean class, -1 or +1
predicted_label = cmp(prediction, 0)
# sometimes the result is not within -1 ... +1; clip it so
# that it is, and we get a sane-looking probability
# distribution. this will upset some results with non-linear
# partitioning where instance-hyperplane distance can be many
# orders of magnitude larger; I don't have a fix for that
if prediction < -1.0:
prediction = -1.0
if prediction > 1.0:
prediction = 1.0
# if the prediction is negative, then we will maximise the
# value of the -1 class; otherwise, that of the 1 class will
# be greater.
if predicted_label == 1:
distribution = {str(self.resolve_prediction(1)): prediction,
str(self.resolve_prediction(-1)): 1 - prediction}
else:
distribution = {str(self.resolve_prediction(1)): prediction + 1,
str(self.resolve_prediction(-1)): -prediction}
return DictionaryProbDist(distribution)
[docs] def classify(self, featureset):
"""
Use a trained SVM to predict a label given for an unlabelled instance
:param featureset: a dict of feature/value pairs in NLTK format, representing a single instance
"""
prediction = self._get_svm_classification(featureset)
if self._verbose:
print('prediction', prediction)
return self.resolve_prediction(prediction)
@staticmethod
[docs] def train(featuresets):
"""
given a set of training instances in nltk format:
[ ( {feature:value, ..}, str(label) ) ]
train a support vector machine
:param featuresets: training instances
"""
# build a unique list of labels
labels = set()
for (features, label) in featuresets:
labels.add(label)
# this is a binary classifier only
if len(labels) > 2:
raise ValueError('Can only do boolean classification (labels: '+ str(labels) + ')')
return False
# we need ordering, so a set's no good
labels = list(labels)
# next, assign -1 and 1
labelmapping = {labels[0]:-1, labels[1]:1}
# now for feature conversion
# iter through instances, building a set of feature:type:str(value) triples
svmfeatures = set()
for (features, label) in featuresets:
for k,v in features.iteritems():
svmfeatures.add(featurename(k, v))
# svmfeatures is indexable by integer svm feature number
# svmfeatureindex is the inverse (svm feature name -> number)
svmfeatures = list(svmfeatures)
svmfeatureindex = dict(zip(svmfeatures, range(len(svmfeatures))))
# build svm feature set case by case
svmfeatureset = []
for instance in featuresets:
svmfeatureset.append(map_instance_to_svm(instance, labelmapping, svmfeatureindex))
# train the svm
# TODO: implement passing of SVMlight parameters from train() to learn()
return SvmClassifier(labels, labelmapping, svmfeatures, svmlight.learn(svmfeatureset, type='classification'))
[docs]def demo():
def gender_features(word):
return {'last_letter': word[-1], 'penultimate_letter': word[-2]}
from nltk.classify import accuracy
from nltk.corpus import names
import random
names = ([(name, 'male') for name in names.words('male.txt')] +
[(name, 'female') for name in names.words('female.txt')])
import random
random.seed(60221023)
random.shuffle(names)
featuresets = [(gender_features(n), g) for (n,g) in names]
train_set, test_set = featuresets[500:], featuresets[:500]
print('--- nltk.classify.svm demo ---')
print('Number of training examples:', len(train_set))
classifier = SvmClassifier.train(train_set)
print('Total SVM dimensions:', len(classifier._svmfeatureindex))
print('Label mapping:', classifier._labelmapping)
print('--- Processing an example instance ---')
print('Reference instance:', names[0])
print('NLTK-format features:\n ' + str(test_set[0]))
print('SVMlight-format features:\n ' + str(map_instance_to_svm(test_set[0], classifier._labelmapping, classifier._svmfeatureindex)))
distr = classifier.prob_classify(test_set[0][0])
print('Instance classification and confidence:', distr.max(), distr.prob(distr.max()))
print('--- Measuring classifier performance ---')
print('Overall accuracy:', accuracy(classifier, test_set))
if __name__ == '__main__':
demo()