Predicting Game Outcomes with Decision Trees

1 Introduction

The dataset comes from Kaggle and contains 9,879 ranked solo/duo games between Diamond I and Master tier, so the two sides in each match are at roughly the same skill level. Each record describes the state of a game after its first 10 minutes; each team has 19 features, for a total of 38 features across the blue and red sides. The features include champion kills, deaths, gold, experience, level, and so on. A game usually lasts 30 to 40 minutes, but the situation after the first 10 minutes already shapes the eventual outcome to a large degree.
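As a quick sanity check of that description, the raw CSV can be inspected before any preprocessing. This is a small sketch rather than part of the original write-up; it assumes the Kaggle file has been saved locally as 1.csv, the filename used by the code later in this post.

import pandas as pd

raw = pd.read_csv("1.csv")               # Kaggle CSV saved locally (assumed filename)
print(raw.shape)                          # 9879 rows; gameId + blueWins + 38 team features
print(raw['blueWins'].value_counts())     # how often the blue side wins vs. loses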

Original project source: https://gitlab.diantouedu.cn/QY/test1/tree/master/%E4%BA%BA%E5%B7%A5%E6%99%BA%E8%83%BD%E7%B3%BB%E7%BB%9F%E5%AE%9E%E6%88%98%E7%AC%AC%E4%B8%89%E6%9C%9F/%E5%AE%9E%E6%88%98%E4%BB%A3%E7%A0%81/%E6%9C%BA%E5%99%A8%E5%AD%A6%E4%B9%A0%E9%A1%B9%E7%9B%AE%E5%AE%9E%E6%88%98/%E5%9F%BA%E4%BA%8E%E5%86%B3%E7%AD%96%E6%A0%91%E7%9A%84%E8%8B%B1%E9%9B%84%E8%81%94%E7%9B%9F%E8%83%9C%E8%B4%9F%E9%A2%84%E6%B5%8B

This article makes some deletions, revisions, and additions on top of that project.

Repository for this article: https://github.com/Guoxn1/ai

2 Data Preprocessing

  1. Drop the first column: the match ID (gameId) has no predictive value.

  2. Remove duplicated information: the blue and red gold differences mirror each other, so recording one side is enough.

  3. Remove highly collinear variables.

import pandas as pd   # data handling
import numpy as np    # numerical operations
from collections import Counter, defaultdict
from sklearn.model_selection import train_test_split, cross_validate  # dataset splitting and cross-validation
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score  # evaluation metrics
import matplotlib.pyplot as plt  # plotting
import seaborn as sns            # plotting
from sklearn.tree import DecisionTreeClassifier
from sklearn.decomposition import PCA

data_df = pd.read_csv("1.csv")
data_df = data_df.drop(columns="gameId")   # the match ID carries no predictive information
features = data_df.columns[1:]             # every column except the label blueWins
data_df1 = data_df.copy()
data_df1.info()
# drop the red-side features that simply mirror the blue-side columns
data_df1 = data_df1.drop(columns=["redGoldDiff", 'redExperienceDiff', 'redCSPerMin',
                                  'redGoldPerMin', 'redFirstBlood', "redDeaths", "redKills"])
# Plot a correlation heatmap; features with |correlation| >= 0.88 will be removed
plt.figure(figsize=(18, 14))
sns.heatmap(round(data_df1.corr(), 2), cmap='Blues', annot=True)
plt.show()

[Figure: correlation heatmap before removing the highly collinear features]
# Find pairs of features whose absolute correlation is at least 0.88
corr = data_df1.corr()
row_indices, col_indices = np.where(np.abs(corr) >= 0.88)

# Collect one column from each correlated pair.
# Note: adding both the (row, col) and (col, row) entries would mark both features
# of a pair and delete them all; scanning only row < col keeps one feature per pair.
col_set = set()
for row, col in zip(row_indices, col_indices):
    if row < col and corr.columns[row] not in col_set:
        col_set.add(corr.columns[col])

for i in col_set:
    data_df1 = data_df1.drop(i, axis=1)
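An equivalent and slightly more compact way to drop exactly one feature from each highly correlated pair is to scan only the upper triangle of the correlation matrix. This is a sketch of that common recipe, not code from the original post; it assumes the same 0.88 threshold.

# Sketch: drop one feature per pair with |correlation| >= 0.88 using the upper triangle
corr_abs = data_df1.corr().abs()
upper = corr_abs.where(np.triu(np.ones(corr_abs.shape, dtype=bool), k=1))
to_drop = [col for col in upper.columns if (upper[col] >= 0.88).any()]
data_df1 = data_df1.drop(columns=to_drop)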

# Re-plot the heatmap to confirm the highly collinear features are gone
plt.figure(figsize=(18, 14))
sns.heatmap(round(data_df1.corr(), 2), cmap='Blues', annot=True)
plt.show()
[Figure: correlation heatmap after removing the highly collinear features]

3 Data Processing and Parameter Tuning

Hyperparameters are tuned with 10-fold cross-validation.

# After removing the collinear features found in the heatmap, search directly for the best parameters
from sklearn.preprocessing import StandardScaler

feature_names = data_df1.columns[1:]
all_x = data_df1[feature_names].values
all_y = data_df['blueWins'].values

x_train, x_test, y_train, y_test = train_test_split(all_x, all_y, test_size=0.2, random_state=42)

# Fit the scaler on the training set only and reuse it on the test set,
# so no information from the test data leaks into the preprocessing step
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)
print(x_train.shape)
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import confusion_matrix, classification_report
# 1. After removing the collinear features, search directly for the best parameters

parameters = {
    'splitter': ('best', 'random'),
    'criterion': ('gini', 'entropy'),
    'max_depth': [*range(1, 10, 1)],
}

clf = DecisionTreeClassifier(random_state=0)
GS = GridSearchCV(clf, parameters, cv=10)   # 10-fold cross-validation
GS.fit(x_train, y_train)
print("best score: ", GS.best_score_)
print("best param: ", GS.best_params_)
# Retrain with the best parameters found by the grid search
best_clf = DecisionTreeClassifier(
    criterion="entropy", max_depth=7, splitter="random")
best_clf.fit(x_train, y_train)
print("score:", best_clf.score(x_test, y_test))
# Print the classification report
y_pred = best_clf.predict(x_test)
cm = confusion_matrix(y_test, y_pred)
cr = classification_report(y_test, y_pred)
print('Classification report : \n', cr)
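To see which of the remaining early-game features drive the prediction, the fitted sklearn tree exposes feature_importances_. This is a small sketch added here (not in the original post); it reuses best_clf and feature_names from the code above.

# Sketch: rank the features by the importance assigned by the fitted tree
importances = pd.Series(best_clf.feature_importances_, index=feature_names)
print(importances.sort_values(ascending=False).head(10))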

4 Implementing a Decision Tree from Scratch

I copied this implementation directly; it is kept here so I can come back to it when I need it.

# Decision tree classifier implemented from scratch
class DecisionTree(object):
    def __init__(self, classes, features,
                 max_depth=10, min_samples_split=10,
                 impurity_t='entropy'):
        self.classes = classes
        self.features = features
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.impurity_t = impurity_t
        self.root = None  # root node, None before training
        self.tree = defaultdict(list)

    # sklearn's cross_validate requires this method to return all parameters
    def get_params(self, deep=True):
        return {'classes': self.classes, 'features': self.features,
                'max_depth': self.max_depth, 'min_samples_split': self.min_samples_split,
                'impurity_t': self.impurity_t}

    # sklearn's GridSearchCV requires this method to set all parameters
    def set_params(self, **parameters):
        for parameter, value in parameters.items():
            setattr(self, parameter, value)
        return self

    def impurity(self, label):
        '''
        Compute impurity: information entropy or the Gini index, depending on impurity_t.
        label is a 1-D numpy array of the labels reaching the current split.
        '''
        cnt, total = Counter(label), float(len(label))
        probs = [cnt[v] / total for v in cnt]
        if self.impurity_t == 'gini':
            return 1 - sum([p * p for p in probs])
        return -sum([p * np.log2(p) for p in probs if p > 0])

    def gain(self, feature, label) -> tuple:
        # Impurity before the split, computed from the labels alone
        p_impurity = self.impurity(label)

        # Record the sample indices belonging to each value of the feature
        f_index = defaultdict(list)
        for idx, v in enumerate(feature):
            f_index[v].append(idx)

        # Impurity after splitting on this feature: weighted sum over its values
        c_impurity = 0
        for v in f_index:
            f_l = label[f_index[v]]
            c_impurity += self.impurity(f_l) * len(f_l) / len(label)

        # Use the gain ratio (divide by the intrinsic impurity of the feature)
        # to avoid the natural bias towards many-valued features and reduce overfitting
        r = self.impurity(feature)
        r = (p_impurity - c_impurity) / (r if r != 0 else 1)
        return r, f_index

    def expand_node(self, feature, label, depth, used_features) -> tuple:
        # 1. Stopping criteria: a single class needs no split;
        #    reaching the depth or sample-count threshold returns a leaf node
        if len(set(label)) == 1:
            return label[0]
        most = Counter(label).most_common(1)[0][0]
        if depth > self.max_depth or len(label) < self.min_samples_split:
            return most

        # 2. Iterate over all unused features and call gain() to find the best split
        bestf, max_gain, bestf_idx = -1, -1, None
        for f in range(len(self.features)):
            if f in used_features:
                continue
            # gain of this feature and the sample indices of each of its values
            f_gain, f_idx = self.gain(feature[:, f], label)
            if bestf < 0 or f_gain > max_gain:
                bestf, max_gain, bestf_idx = f, f_gain, f_idx

        # 3. If no useful split feature is found, stop the recursion as well
        if bestf < 0:
            return most

        # 4. Recurse on each value of the chosen feature; children maps {feature value: child node}
        children = {}
        new_used_features = used_features + [bestf]
        for v in bestf_idx:
            c_idx = bestf_idx[v]
            children[v] = self.expand_node(feature[c_idx, :],
                                           label[c_idx], depth + 1, new_used_features)
        self.tree[depth].append(self.features[bestf])
        return (bestf, children, most)

    def traverse_node(self, node, feature):
        # the sample must have the same number of features as defined for the model
        assert len(self.features) == len(feature)
        # reached a leaf node: return the predicted class
        if type(node) is not tuple:
            return node
        # route into the matching child node recursively; node[0] stores the feature index
        fv = feature[node[0]]
        if fv in node[1]:
            return self.traverse_node(node[1][fv], feature)
        # the feature value never appeared during training:
        # return the majority class of the samples that reached this node
        return node[-1]

    def fit(self, feature, label):
        # the number of input features must match the model definition
        assert len(self.features) == len(feature[0])
        # split from the root node and keep a reference to it on the model
        self.root = self.expand_node(
            feature, label, depth=1, used_features=[])

    def predict(self, feature):
        assert len(feature.shape) == 1 or len(feature.shape) == 2  # 1-D or 2-D input only
        if len(feature.shape) == 1:  # a single sample
            return self.traverse_node(self.root, feature)  # route it from the root node
        # many samples
        return np.array([self.traverse_node(self.root, f) for f in feature])
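As a quick sanity check of the from-scratch tree, here is a tiny hypothetical example (not part of the original project) in which the label simply copies the second feature, so a single split on that feature classifies everything correctly.

# Toy check of the from-scratch DecisionTree (hypothetical example):
# the label equals the second feature, so one split on 'f1' is enough
toy_x = np.array([[0, 1],
                  [0, 0],
                  [1, 1],
                  [1, 0]])
toy_y = np.array([1, 0, 1, 0])
toy_dt = DecisionTree(classes=[0, 1], features=['f0', 'f1'],
                      max_depth=3, min_samples_split=2, impurity_t='entropy')
toy_dt.fit(toy_x, toy_y)
print(toy_dt.predict(np.array([1, 1])))  # expected output: 1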
# Instantiate the decision tree model with its hyperparameters
DT = DecisionTree(classes=[0, 1], features=feature_names,
                  max_depth=3, min_samples_split=450, impurity_t='gini')

DT.fit(x_train, y_train)       # train on the training set
p_test = DT.predict(x_test)    # predict on the test set
print('pred_value ', p_test)   # predicted labels
print('true_value ', y_test)   # ground-truth labels
test_acc = accuracy_score(y_test, p_test)  # compare predictions with the test labels
precision = precision_score(y_test, p_test)
recall = recall_score(y_test, p_test)
f1 = f1_score(y_test, p_test)
print('\nTREE:')
for _ in DT.tree.keys():
    print('Layer' + str(_) + ':' + str(DT.tree[_]))
print('\naccuracy: {:.4f} precision: {:.4f} recall: {:.4f} f1_score: {:.4f}'.format(
    test_acc, precision, recall, f1))  # print the evaluation metrics
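The class implements get_params and set_params precisely so that sklearn's model-selection utilities can drive it, although that usage is not shown above. A minimal sketch of it follows (hypothetical; an explicit scoring metric is required because the class defines no score method):

# Sketch: plug the from-scratch tree into sklearn's cross_validate
# (hypothetical usage; scoring must be given explicitly since DecisionTree has no score())
from sklearn.model_selection import cross_validate

cv_dt = DecisionTree(classes=[0, 1], features=feature_names,
                     max_depth=3, min_samples_split=450, impurity_t='gini')
cv_results = cross_validate(cv_dt, x_train, y_train, cv=5, scoring='accuracy')
print(cv_results['test_score'].mean())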
