# 基于sklearn的线性分类器logistic（对数几率回归）Python实现

Email：louhergetup@gmail.com

# 理论部分

## logistic 函数

            
# Plot the logistic (sigmoid) curve y = 1 / (1 + e^(-x)) over [-10, 10].
xs = np.linspace(-10, 10, num=1000)
sigmoid = 1.0 / (1.0 + np.exp(-xs))
plt.plot(xs, sigmoid)
plt.show()




## 对数几率回归

• 它是直接对分类可能性进行建模，无需事先假设数据分布，这样就避免了假设分布不准所带来的问题；
• 它不是仅预测“类别”，而是可得到近似概率预测，这对许多需利用概率辅助决策的任务很有用；
• 对率回归求解的目标函数是任意阶可导的凸函数，有很好的数学性质，现有的许多数值化算法都可直接用于求解最优解。

# 使用Python实现logistic回归

## 实验环境

• 操作系统：Windows 10 64位
• 编程语言：Python3.7.3

## 导入Python库

            
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import re
import csv
from sklearn.feature_extraction import DictVectorizer




## 数据源

            
1,youth,hight,no,fair,no
2,youth,hight,no,excellent,no
3,middle_aged,hight,no,fair,yes
4,senior,medium,no,fair,yes
5,senior,low,yes,fair,yes
6,senior,low,yes,excellent,no
7,middle_aged,low,yes,excellent,yes
8,youth,medium,no,fair,no
9,youth,low,yes,fair,yes
10,senior,medium,yes,fair,yes
11,youth,medium,yes,excellent,yes
12,middle_aged,medium,no,excellent,yes
13,middle_aged,hight,yes,fair,yes
14,senior,medium,no,excellent,no




## 数据整理

            
future_list = [
{
"age"   : "youth",
"income": "hight",
...
}
...
]
​
answer_list = ["no", "no", "yes", ...]




            

# Rebuild the feature-dict list and the label list from the CSV data.
# Each CSV row is: id, age, income, student, credit_rating, class-label
# (see the data source above). Column 0 (the id) and the last column
# (the label) are excluded from the feature dicts.
# NOTE(review): the original snippet was incomplete (no file open, no
# reader loop, `row` undefined) — reconstructed here; confirm the CSV
# path against the original project.
headers = ["id", "age", "income", "student", "credit_rating", "class"]

future_list = []  # "future" is the original author's spelling of "feature"
label_list = []

data_file = open("data.csv", "r")  # TODO confirm the actual file name
reader = csv.reader(data_file)
for row in reader:
    label_list.append(row[-1])
    row_dict = {}
    # Columns 1..len(row)-2 are the categorical features.
    for i in range(1, len(row) - 1):
        row_dict[headers[i]] = row[i]
    future_list.append(row_dict)
data_file.close()




## 随机变量向量化

youth middle_aged senior
1 0 0

age=middle_aged age=senior age=youth credit_rating=excellent credit_rating=fair income=hight income=low income=medium student=no student=yes
0 0 1 0 1 1 0 0 1 0

## 特征向量化

            
# One-hot encode the list of feature dicts into a dense numeric matrix.
vec = DictVectorizer()
dummy_x = vec.fit_transform(future_list).toarray()

print("dummy_x:", dummy_x)
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in
# 1.2; use get_feature_names_out() when available, falling back for old
# versions so the snippet runs on both.
if hasattr(vec, "get_feature_names_out"):
    feature_names = vec.get_feature_names_out()
else:
    feature_names = vec.get_feature_names()
print("vec.get_feature_names()", feature_names)




            
dummy_x: [[0. 0. 1. 0. 1. 1. 0. 0. 1. 0.]
[0. 0. 1. 1. 0. 1. 0. 0. 1. 0.]
[1. 0. 0. 0. 1. 1. 0. 0. 1. 0.]
[0. 1. 0. 0. 1. 0. 0. 1. 1. 0.]
[0. 1. 0. 0. 1. 0. 1. 0. 0. 1.]
[0. 1. 0. 1. 0. 0. 1. 0. 0. 1.]
[1. 0. 0. 1. 0. 0. 1. 0. 0. 1.]
[0. 0. 1. 0. 1. 0. 0. 1. 1. 0.]
[0. 0. 1. 0. 1. 0. 1. 0. 0. 1.]
[0. 1. 0. 0. 1. 0. 0. 1. 0. 1.]
[0. 0. 1. 1. 0. 0. 0. 1. 0. 1.]
[1. 0. 0. 1. 0. 0. 0. 1. 1. 0.]
[1. 0. 0. 0. 1. 1. 0. 0. 0. 1.]
[0. 1. 0. 1. 0. 0. 0. 1. 1. 0.]]




## 分类结果向量化

            
# Turn the "yes"/"no" class labels into a binary 0/1 column vector.
from sklearn import preprocessing

binarizer = preprocessing.LabelBinarizer()
dummy_y = binarizer.fit_transform(label_list)




## 模型建立

            
from sklearn.linear_model import LogisticRegression

# Pin the solver that produced the results shown below: scikit-learn
# changed the default from 'liblinear' to 'lbfgs' in version 0.22, which
# would yield slightly different coefficients and scores.
lr = LogisticRegression(solver='liblinear')




## 模型训练

            
# Train the model. LabelBinarizer produced dummy_y as an (n, 1) column
# vector; fit() expects a 1-D label array, so flatten it with ravel()
# to avoid sklearn's DataConversionWarning.
lr.fit(dummy_x, dummy_y.ravel())

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
verbose=0, warm_start=False)




## 模型测试

            
from sklearn.metrics import classification_report

# Build one synthetic test sample: copy the first training row and flip
# the one-hot "age" columns from youth (index 2) to middle_aged (index 0).
first_row = dummy_x[0, :]
new_row = list(first_row)
new_row[0] = 1
new_row[2] = 0
# The original snippet built new_row but never used it — predict on it,
# as the tutorial clearly intended. predict() takes a 2-D array.
print("new_row prediction:", lr.predict([new_row]))

# Evaluate on the training set: mean accuracy, then the per-class
# precision/recall/F1 report.
print(lr.score(dummy_x, dummy_y))
y_result = lr.predict(dummy_x)
print(classification_report(dummy_y, y_result, target_names=['NO', 'YES']))



            
0.8571428571428571
precision    recall  f1-score   support

NO       1.00      0.60      0.75         5
YES       0.82      1.00      0.90         9

micro avg       0.86      0.86      0.86        14
macro avg       0.91      0.80      0.82        14
weighted avg       0.88      0.86      0.85        14



• classification_report()用于测试准确率，精确率和召回率
• .score()用于评估本模型的准确率

QQ号联系： 360901061

【本文对您有帮助就好】