机器学习--支持向量机实践

Posted on 2021-06-13 Edited on 2021-06-14

机器学习--支持向量机实践

今天，我们来进行支持向量机的实践。

首先是手动实现线性支持向量机：

from cvxopt import matrix, solvers
import numpy as np 
import matplotlib.pyplot as plt 
# Generate the toy data set from normally distributed random numbers.
def split_train_test_data(mean1, mean2, sdt, n):
    """Create a 2-D two-class toy data set, plot it and split it 80/20.

    Both coordinates of a positive sample are drawn from N(mean1, sdt) and
    both coordinates of a negative sample from N(mean2, sdt).  The global
    NumPy random state is re-seeded with 529 on every call, so the output is
    reproducible.

    Args:
        mean1: Mean of the positive (+1) class coordinates.
        mean2: Mean of the negative (-1) class coordinates.
        sdt: Standard deviation shared by both classes.
        n: Total number of samples; each class receives ``int(n / 2)``.

    Returns:
        ``(F_train, F_test, y_train, y_test)``.  In every array the negative
        samples come first, followed by the positive ones.  Label arrays are
        column vectors holding -1.0 / 1.0.
    """
    np.random.seed(529)
    per_class = int(n / 2)

    # Positive class: two independent draws, one per coordinate.
    pos_first = np.random.normal(loc=mean1, scale=sdt, size=(per_class, 1))
    pos_second = np.random.normal(loc=mean1, scale=sdt, size=(per_class, 1))
    pos_points = np.hstack((pos_first, pos_second))
    pos_labels = np.ones((per_class, 1))

    # Negative class: drawn after the positive class so the random stream is
    # consumed in a fixed order.
    neg_first = np.random.normal(loc=mean2, scale=sdt, size=(per_class, 1))
    neg_second = np.random.normal(loc=mean2, scale=sdt, size=(per_class, 1))
    neg_points = np.hstack((neg_first, neg_second))
    neg_labels = -np.ones((per_class, 1))

    # Show both classes in a scatter plot.
    axes = plt.figure().add_subplot(111)
    axes.scatter(pos_first, pos_second, color="#ffb07c", s=100, label="1")
    axes.scatter(neg_first, neg_second, color="#c94cbe", s=100, label="-1")
    plt.legend()
    plt.show()

    # The first 80 % of each class go to the training set, the rest to the
    # test set.
    cut = int(n / 2 * 0.8)
    F_train = np.vstack((neg_points[:cut], pos_points[:cut]))
    y_train = np.vstack((neg_labels[:cut], pos_labels[:cut]))
    F_test = np.vstack((neg_points[cut:], pos_points[cut:]))
    y_test = np.vstack((neg_labels[cut:], pos_labels[cut:]))

    return F_train, F_test, y_train, y_test

# Build the data set (class means 6 and 1, standard deviation 2, 50 samples)
# and report the shape of every split.
F_train, F_test, y_train, y_test = split_train_test_data(6, 1, 2, 50)
for caption, split in (
    ("训练集规模", F_train),
    ("测试集规模", F_test),
    ("训练集标签", y_train),
    ("测试集标签", y_test),
):
    print(caption, split.shape)

def train(x, y, C):
    """Fit a soft-margin linear SVM by solving its dual problem with cvxopt.

    The dual is handed to ``solvers.qp`` in its standard form
    ``min 1/2 a^T P a + q^T a  s.t.  G a <= h,  A a = b`` with

    * ``P[i, j] = y_i * y_j * <x_i, x_j>`` and ``q = -1``,
    * ``G, h`` encoding the box constraint ``0 <= a_i <= C``,
    * ``A, b`` encoding ``sum_i a_i * y_i = 0``.

    Args:
        x: Training samples, array-like of shape (n_samples, n_features).
        y: Training labels, array-like with n_samples entries (a column
            vector is accepted).  The formulation assumes labels of +1 / -1.
        C: Upper bound on every Lagrange multiplier (soft-margin penalty).

    Returns:
        ``(w_best, b_best)``: the weight vector of shape (n_features,) and
        the bias as an array of shape (1,).

    Raises:
        ValueError: If ``x`` and ``y`` do not describe the same number of
            samples.
    """
    x = np.asarray(x, dtype=float)
    # cvxopt needs floating point data; the column shape makes the
    # broadcasting below explicit.
    y = np.asarray(y, dtype=float).reshape(-1, 1)
    n_samples = x.shape[0]
    if y.shape[0] != n_samples:
        raise ValueError(
            "x has %d samples but y has %d labels" % (n_samples, y.shape[0])
        )

    # Gram matrix of the samples and outer product of the labels.
    gram = x @ x.T
    label_outer = y @ y.T

    # Objective.  Every size is derived from the data so that training sets
    # of any length work.
    p = matrix(label_outer * gram)
    q = matrix(-np.ones(n_samples))
    # Equality constraint: sum_i a_i * y_i = 0.
    A = matrix(y.reshape(1, -1))
    b = matrix(0.)
    # Inequality constraints: -a_i <= 0 stacked on top of a_i <= C.
    g = matrix(np.vstack((-np.eye(n_samples), np.eye(n_samples))))
    h = matrix(np.vstack((np.zeros((n_samples, 1)),
                          np.full((n_samples, 1), float(C)))))

    # Solve the QP and read the Lagrange multipliers.
    solution = solvers.qp(p, q, g, h, A, b)
    a = np.ravel(solution['x'])

    # w = sum_i a_i * y_i * x_i
    w_best = np.sum(a.reshape(-1, 1) * y * x, axis=0)

    # Candidate bias for every sample j: y_j - sum_i a_i * y_i * <x_i, x_j>.
    # The identity only holds for samples lying exactly on the margin
    # (0 < a_j < C), so the average is taken over those samples only.
    labels = y.ravel()
    bias_candidates = labels - gram @ (a * labels)
    tol = 1e-5  # multipliers below this are treated as numerically zero
    on_margin = (a > tol) & (a < C - tol)
    if not on_margin.any():
        # No free support vector: fall back to all support vectors ...
        on_margin = a > tol
    if not on_margin.any():
        # ... and, for a degenerate solution, to every sample.
        on_margin = np.ones(n_samples, dtype=bool)
    # Wrapped in a length-1 array so callers can keep using ``.ravel()`` and
    # broadcasting on the bias.
    b_best = np.array([bias_candidates[on_margin].mean()])

    return w_best, b_best

# Train with a large penalty (C=100) and compute the decision boundary
# w[0]*x1 + w[1]*x2 + b = 0, solved for x2.
w, b = train(F_train, y_train, 100)
x = np.linspace(-6, 10, 50)
y = (-w[0] / w[1] * x - b / w[1]).ravel()

# Select each class by its label instead of by a fixed row count, and draw
# every class with a single scatter call.
train_labels = y_train.ravel()
plt.scatter(F_train[train_labels < 0, 0], F_train[train_labels < 0, 1],
            color="#c94cbe", s=100)
plt.scatter(F_train[train_labels > 0, 0], F_train[train_labels > 0, 1],
            color="#ffb07c", s=100)
plt.plot(x, y, color="#087804")
def test(w, b, x):
    prediction=np.sign(np.dot(x, w)+b)
    return prediction
    
# Predict the test set and report the accuracy in percent.
prediction = test(w, b, F_test)
num = int(np.sum(np.ravel(prediction) == y_test.ravel()))
acc = 100 * num / (y_test.shape[0])
print("acc = %.2f %%"%acc)

# Decision boundary (w . x + b = 0) and the two margin lines
# (w . x + b = -1 and w . x + b = +1), each solved for the second coordinate.
x = np.linspace(-6, 10, 50)
slope = -w[0] / w[1]
y = (slope * x - b / w[1]).ravel()
y1 = (slope * x - (b + 1) / w[1]).ravel()  # margin line
y2 = (slope * x - (b - 1) / w[1]).ravel()

# Split the points by label rather than by hard-coded row counts and draw
# every group with a single scatter call.
train_labels = y_train.ravel()
test_labels = y_test.ravel()
# Training data
plt.scatter(F_train[train_labels < 0, 0], F_train[train_labels < 0, 1],
            color="#c94cbe", s=100)
plt.scatter(F_train[train_labels > 0, 0], F_train[train_labels > 0, 1],
            color="#ffb07c", s=100)
# Test data (star markers)
plt.scatter(F_test[test_labels < 0, 0], F_test[test_labels < 0, 1],
            color="#c94cbe", marker="*", s=100)
plt.scatter(F_test[test_labels > 0, 0], F_test[test_labels > 0, 1],
            color="#ffb07c", marker="*", s=100)
plt.plot(x, y, color="#087804")
plt.plot(x, y1, color="#048243", ls='--')
plt.plot(x, y2, color="#048243", ls='--')

然后是使用sklearn实现支持向量机：

from sklearn import svm

# Linear-kernel support vector classifier.  gamma is kept from the original
# snippet although the linear kernel does not use it.
model = svm.SVC(kernel='linear', C=1, gamma=1)

# Fit and score on the training split built above.  The labels are stored as
# a column vector, so they are flattened to the 1-D form passed to
# scikit-learn.
model.fit(F_train, y_train.ravel())
model.score(F_train, y_train.ravel())

# Predict the held-out test samples.
predicted = model.predict(F_test)

优化机器学习算法的参数值，能有效地提高模型的性能。SVC 的完整参数列表如下：

sklearn.svm.SVC(C=1.0, kernel='rbf', degree=3, gamma='scale', coef0=0.0, shrinking=True, probability=False, tol=0.001, cache_size=200, class_weight=None, verbose=False, max_iter=-1, decision_function_shape='ovr', break_ties=False, random_state=None)

其中对模型性能影响较大的重要参数有“kernel”、“gamma”和“C”。

谢谢大家的观看~