2013-09-23 87 views
7

我已经使用 sklearn 框架中的 LinearSVC 和 SVC 实现了文本分类。我使用 TfidfVectorizer 得到输入数据的稀疏表示,这些数据由两个不同的类别(良性数据和恶意数据)组成。这部分工作得很好,但现在我想使用 OneClassSVM 分类器实现某种异常检测,即只用一个类别来训练模型(离群点检测……)。不幸的是,它不支持稀疏数据。一些开发人员正在开发一个补丁(https://github.com/scikit-learn/scikit-learn/pull/1586),但该补丁还存在一些缺陷,所以目前还没有能让 OneClassSVM 处理稀疏数据的解决方案。[scikit-learn]:异常检测 - OneClassSVM 的替代方案

sklearn 框架中有没有其他方法可以做类似的事情?我查看了文档中的示例,但似乎没有合适的。

谢谢!

回答

1

不幸的是,scikit-learn 目前只实现了两种异常检测方法:单类 SVM(One-Class SVM)和稳健协方差估计(robust covariance estimator)。

你可以在二维数据上尝试这两种方法,并对比它们之间的差异(下面的示例取自官方文档):

import numpy as np 
import pylab as pl 
import matplotlib.font_manager 
from scipy import stats 

from sklearn import svm 
from sklearn.covariance import EllipticEnvelope 

# Compare two outlier detectors (One-Class SVM and a robust covariance
# estimator) on synthetic 2D data made of two Gaussian inlier clusters plus
# uniformly distributed outliers. One figure is drawn per cluster separation,
# with one subplot per detector.

# Example settings.
n_samples = 200
outliers_fraction = 0.25
clusters_separation = [0, 1, 2]

# Define the outlier detection tools to be compared.
classifiers = {
    "One-Class SVM": svm.OneClassSVM(
        nu=0.95 * outliers_fraction + 0.05, kernel="rbf", gamma=0.1
    ),
    "robust covariance estimator": EllipticEnvelope(contamination=.1),
}

# Grid on which each fitted decision function is evaluated for plotting.
xx, yy = np.meshgrid(np.linspace(-7, 7, 500), np.linspace(-7, 7, 500))

# Derive the inlier count from the outlier count so that the two always sum
# to n_samples (independent truncation could lose a sample).
n_outliers = int(outliers_fraction * n_samples)
n_inliers = n_samples - n_outliers

# Ground truth: 1 for inliers (first n_inliers rows), 0 for outliers.
# Forward slicing stays correct even when n_outliers == 0, where
# ``[-n_outliers:]`` would select the whole array.
ground_truth = np.ones(n_samples, dtype=int)
ground_truth[n_inliers:] = 0

# Fit the problem with varying cluster separation.
for offset in clusters_separation:
    # Re-seed so every separation uses the same underlying random draws.
    np.random.seed(42)

    # Data generation: two inlier clusters centred at -offset and +offset.
    # Array dimensions must be integers; split so both halves add up to
    # n_inliers even when it is odd.
    n_first_cluster = n_inliers // 2
    n_second_cluster = n_inliers - n_first_cluster
    X1 = 0.3 * np.random.randn(n_first_cluster, 2) - offset
    X2 = 0.3 * np.random.randn(n_second_cluster, 2) + offset
    X = np.r_[X1, X2]
    # Add outliers after the inliers, matching the layout of ground_truth.
    X = np.r_[X, np.random.uniform(low=-6, high=6, size=(n_outliers, 2))]

    # Fit every detector on this data set and plot them side by side.
    pl.figure(figsize=(10, 5))
    for plot_index, (clf_name, clf) in enumerate(classifiers.items(), start=1):
        # Fit the data and tag outliers: the lowest-scoring
        # ``outliers_fraction`` of the samples are predicted as outliers.
        clf.fit(X)
        scores = clf.decision_function(X).ravel()
        threshold = stats.scoreatpercentile(scores, 100 * outliers_fraction)
        y_pred = scores > threshold
        n_errors = (y_pred != ground_truth).sum()

        # Plot the level lines and the points.
        Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
        Z = Z.reshape(xx.shape)
        subplot = pl.subplot(1, len(classifiers), plot_index)
        subplot.set_title("Outlier detection")
        subplot.contourf(xx, yy, Z, levels=np.linspace(Z.min(), threshold, 7),
                         cmap=pl.cm.Blues_r)
        a = subplot.contour(xx, yy, Z, levels=[threshold],
                            linewidths=2, colors='red')
        subplot.contourf(xx, yy, Z, levels=[threshold, Z.max()],
                         colors='orange')
        b = subplot.scatter(X[:n_inliers, 0], X[:n_inliers, 1], c='white')
        c = subplot.scatter(X[n_inliers:, 0], X[n_inliers:, 1], c='black')
        subplot.axis('tight')
        # ``ContourSet.collections`` is gone from recent Matplotlib releases;
        # ``legend_elements()`` returns (handles, labels) for the contour.
        boundary_handle = a.legend_elements()[0][0]
        subplot.legend(
            [boundary_handle, b, c],
            ['learned decision function', 'true inliers', 'true outliers'],
            prop=matplotlib.font_manager.FontProperties(size=11))
        subplot.set_xlabel(
            "%d. %s (errors: %d)" % (plot_index, clf_name, n_errors))
        subplot.set_xlim((-7, 7))
        subplot.set_ylim((-7, 7))
    pl.subplots_adjust(left=0.04, bottom=0.1, right=0.96, top=0.94,
                       wspace=0.1, hspace=0.26)

pl.show()