from collections import Counter, defaultdict

import numpy as np


class DecisionTree(object):
    def __init__(self, classes, features, max_depth=10, min_samples_split=10, impurity_t='entropy'):
        self.classes = classes
        self.features = features
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.impurity_t = impurity_t
        self.root = None
        self.tree = defaultdict(list)  # feature names chosen at each depth, kept for inspection

    def get_params(self, deep=True):
        return {'classes': self.classes, 'features': self.features,
                'max_depth': self.max_depth,
                'min_samples_split': self.min_samples_split,
                'impurity_t': self.impurity_t}

    def set_params(self, **parameters):
        for parameter, value in parameters.items():
            setattr(self, parameter, value)
        return self

    def impurity(self, label):
        '''
        Compute impurity: information entropy or the Gini index,
        depending on impurity_t.
        label is a 1-D numpy array holding the labels of one partition
        produced by splitting on the current feature.
        '''
        cnt, total = Counter(label), float(len(label))
        probs = [cnt[v] / total for v in cnt]
        if self.impurity_t == 'gini':
            return 1 - sum(p * p for p in probs)
        return -sum(p * np.log2(p) for p in probs if p > 0)

    def gain(self, feature, label) -> tuple:
        '''Information gain ratio of splitting on one feature column,
        plus the per-value index lists of the resulting split.'''
        p_impurity = self.impurity(label)  # impurity of the parent node

        # Group sample indices by feature value.
        f_index = defaultdict(list)
        for idx, v in enumerate(feature):
            f_index[v].append(idx)

        # Weighted impurity of the child partitions.
        c_impurity = 0
        for v in f_index:
            f_l = label[f_index[v]]
            c_impurity += self.impurity(f_l) * len(f_l) / len(label)

        # Normalize by the split information to penalize many-valued features.
        r = self.impurity(feature)
        r = (p_impurity - c_impurity) / (r if r != 0 else 1)
        return r, f_index

    def expand_node(self, feature, label, depth, used_features) -> tuple:
        # All samples share one label: return it as a leaf.
        if len(set(label)) == 1:
            return label[0]
        # Majority label, used as the fallback prediction.
        most = Counter(label).most_common(1)[0][0]
        if depth > self.max_depth or len(label) < self.min_samples_split:
            return most

        # Pick the unused feature with the highest gain ratio.
        bestf, max_gain, bestf_idx = -1, -1, None
        for f in range(len(self.features)):
            if f in used_features:
                continue
            f_gain, f_idx = self.gain(feature[:, f], label)
            if bestf < 0 or f_gain > max_gain:
                bestf, max_gain, bestf_idx = f, f_gain, f_idx
        if bestf < 0:  # every feature has already been used
            return most

        # Recurse into one child per value of the chosen feature.
        children = {}
        new_used_features = used_features + [bestf]
        for v in bestf_idx:
            c_idx = bestf_idx[v]
            children[v] = self.expand_node(feature[c_idx, :], label[c_idx],
                                           depth + 1, new_used_features)
        self.tree[depth].append(self.features[bestf])
        return (bestf, children, most)

    def traverse_node(self, node, feature):
        assert len(self.features) == len(feature)
        if type(node) is not tuple:  # leaf node: a plain label
            return node
        fv = feature[node[0]]
        if fv in node[1]:  # follow the branch for this feature value
            return self.traverse_node(node[1][fv], feature)
        return node[-1]  # unseen value: fall back to the majority label

    def fit(self, feature, label):
        assert len(self.features) == len(feature[0])
        self.root = self.expand_node(feature, label, depth=1, used_features=[])

    def predict(self, feature):
        assert len(feature.shape) == 1 or len(feature.shape) == 2
        if len(feature.shape) == 1:  # a single sample
            return self.traverse_node(self.root, feature)
        return np.array([self.traverse_node(self.root, f) for f in feature])
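
# A minimal usage sketch (not from the original source): the toy feature
# matrix, labels, and feature names below are hypothetical and exist only
# to show the fit/predict API of the class above. Features are assumed to
# be categorical, encoded as small integers.
features = ['outlook', 'windy']      # hypothetical feature names
X = np.array([[0, 0],
              [0, 1],
              [1, 0],
              [2, 1]])               # one row per sample, categorical values
y = np.array([1, 0, 1, 0])           # binary class labels

dt = DecisionTree(classes=[0, 1], features=features,
                  max_depth=3, min_samples_split=1, impurity_t='gini')
dt.fit(X, y)
print(dt.predict(X))                  # batch prediction, e.g. [1 0 1 0]
print(dt.predict(np.array([2, 0])))   # a single 1-D sample also works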