Machine Learning -- Theory of Logistic Regression and Maximum Entropy Models (Part 3)

Machine Learning | Logistic Regression in Practice

Today, we implement logistic regression in Python.

The following describes the linear model. Given a dataset in which every sample carries a label 0 or 1 indicating its class, we introduce the sigmoid function \(\frac{1}{1+e^{-t}}\), whose value always lies between 0 and 1. Here \(t = \theta^T x_i\), where \(x_i\) denotes a sample and \(\theta\) is the parameter vector we need to obtain by training.
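A minimal sketch of this function (the name sigmoid and the sample inputs are only for illustration, not part of the implementation below):

import numpy as np

def sigmoid(t):
    # maps any real-valued t into the open interval (0, 1)
    return 1.0 / (1.0 + np.exp(-t))

# approaches 0 for large negative t, equals 0.5 at t = 0, approaches 1 for large positive t
print(sigmoid(np.array([-10.0, 0.0, 10.0])))  # approximately [4.54e-05, 0.5, 0.99995]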

Let p be the predicted value of y, i.e. \(p = sigmoid(\theta^T x_i)\). The cost for a single sample is \[ cost = y\cdot(-\log(p))+(1-y)\cdot(-\log(1-p)) \] and averaging over the m samples gives the cost function \(J(\theta) = \frac{1}{m}\sum^m_{i=1}\big[-y^{(i)}\log(p^{(i)}) - (1-y^{(i)})\log(1-p^{(i)})\big]\). Differentiating with respect to \(\theta_j\), and using the fact that the derivative of the sigmoid \(\sigma(t)\) is \(\sigma(t)(1-\sigma(t))\), we obtain \[ \frac{\partial J(\theta)}{\partial \theta_j} = \frac{1}{m}\sum^m_{i = 1}(h_\theta(x^{(i)}) - y^{(i)})x_j^{(i)} \] Applying gradient descent with learning rate \(\alpha\), the parameter update is \[ \theta_j := \theta_j - \frac{\alpha}{m}\sum^m_{i = 1}(h_\theta(x^{(i)}) - y^{(i)})x_j^{(i)} = \theta_j + \frac{\alpha}{m}\sum^m_{i = 1}(y^{(i)} - h_\theta(x^{(i)}))x_j^{(i)} \] (the implementation below folds the constant factor \(\frac{1}{m}\) into the learning rate). The implementation is as follows:

import numpy as np
import math
from sklearn import datasets
from collections import Counter

# sentinel returned by the cost function when log(0) would occur
infinity = float(-2**31)

def sigmodFormatrix(Xb, thetas):
    params = - Xb.dot(thetas)
    r = np.zeros(params.shape[0])  # returns a numpy array
    for i in range(len(r)):
        r[i] = 1 / (1 + math.exp(params[i]))
    return r

def sigmodFormatrix2(Xb, thetas):
    params = - Xb.dot(thetas)
    r = np.zeros(params.shape[0])  # returns a numpy array
    for i in range(len(r)):
        r[i] = 1 / (1 + math.exp(params[i]))
        if r[i] >= 0.5:
            r[i] = 1
        else:
            r[i] = 0
    return r

def sigmod(Xi, thetas):
    params = - np.sum(Xi * thetas)
    r = 1 / (1 + math.exp(params))
    return r

class LinearLogsiticRegression(object):
    thetas = None
    m = 0

    # training
    def fit(self, X, y, alpha=0.01, accuracy=0.00001):
        # prepend a column of ones to form the Xb matrix
        self.thetas = np.full(X.shape[1] + 1, 0.5)
        self.m = X.shape[0]
        a = np.full((self.m, 1), 1)
        Xb = np.column_stack((a, X))
        dimension = X.shape[1] + 1
        # gradient-descent iterations
        count = 1
        while True:
            oldJ = self.costFunc(Xb, y)
            # note: the predictions here use the parameters from before this update
            c = sigmodFormatrix(Xb, self.thetas) - y
            for j in range(dimension):
                self.thetas[j] = self.thetas[j] - alpha * np.sum(c * Xb[:, j])
            newJ = self.costFunc(Xb, y)
            if newJ == oldJ or math.fabs(newJ - oldJ) < accuracy:
                print("Cost function has reached its minimum, stopping!")
                print("Converged to:", newJ)
                break
            print("Iteration", count)
            print("Change in cost from the previous iteration:", (newJ - oldJ))
            count += 1

    # cost function
    def costFunc(self, Xb, y):
        total = 0.0
        for i in range(self.m):
            yPre = sigmod(Xb[i, ], self.thetas)
            if yPre == 1 or yPre == 0:
                return infinity
            total += y[i] * math.log(yPre) + (1 - y[i]) * math.log(1 - yPre)
        return -1 / self.m * total

    # prediction
    def predict(self, X):
        a = np.full((len(X), 1), 1)
        Xb = np.column_stack((a, X))
        return sigmodFormatrix2(Xb, self.thetas)

    def score(self, X_test, y_test):
        y_predict = self.predict(X_test)
        re = (y_test == y_predict)
        re1 = Counter(re)
        a = re1[True] / (re1[True] + re1[False])
        return a

# if __name__ == "main":
from sklearn.model_selection import train_test_split
iris = datasets.load_iris()
X = iris['data']
y = iris['target']
X = X[y != 2]
y = y[y != 2]
X_train, X_test, y_train, y_test = train_test_split(X, y)
myLogstic = LinearLogsiticRegression()
myLogstic.fit(X_train, y_train)
y_predict = myLogstic.predict(X_test)
print("Parameters:", myLogstic.thetas)

print("Test accuracy:", myLogstic.score(X_test, y_test))
print("Training accuracy:", myLogstic.score(X_train, y_train))

The implementation using sklearn is as follows:

from sklearn.linear_model import LogisticRegression
print("Logistic regression from sklearn:")
logr = LogisticRegression()
logr.fit(X_train, y_train)
print("Accuracy:", logr.score(X_test, y_test))