Step 1: Extract Features
- Ranking
- Match win percentage
- Head to Head on grass
- Match win percentage over the past 60 weeks (see the sketch after this list)
- Percentage of best of 5 set matches won
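The actual feature engineering lives in utils/create_features_utils (imported below). As a rough illustration only, here is a minimal sketch of how a rolling win-rate feature could be computed; the matches DataFrame and its winner/loser/match_date columns are hypothetical stand-ins, not the project's real schema:

import pandas as pd

def win_percent_last_60_weeks(matches, player, as_of):
    """Match win percentage for `player` in the 60 weeks before `as_of`.
    Assumes `matches` has columns: winner, loser, match_date (datetime)."""
    start = as_of - pd.Timedelta(weeks=60)
    recent = matches[(matches['match_date'] >= start) & (matches['match_date'] < as_of)]
    # Keep only matches this player took part in
    played = recent[(recent['winner'] == player) | (recent['loser'] == player)]
    if len(played) == 0:
        return 0.5  # no history in the window: fall back to a neutral 50%
    return (played['winner'] == player).mean()

The per-player values would then be differenced (player_0 minus player_1) to produce columns like diff_match_win_percent_grass_60 used below.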
Step 2: Build the XGBoost Model
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier, XGBRegressor
#from utils.create_features_utils import *
sns.set_style("darkgrid")
df = pd.read_csv('predicting-wimbledon-matches-master/data/wimbledon_matches_with_feature.csv')
df = df.dropna()
df['diff_rank'] = df['player_0_rank'] - df['player_1_rank']
# Select the features to use
features_list = [
'diff_rank',
'diff_match_win_percent',
'diff_games_win_percent',
'diff_5_set_match_win_percent',
'diff_close_sets_percent',
'diff_match_win_percent_grass',
'diff_games_win_percent_grass',
'diff_5_set_match_win_percent_grass',
'diff_close_sets_percent_grass',
'diff_match_win_percent_52',
'diff_games_win_percent_52',
'diff_5_set_match_win_percent_52',
'diff_close_sets_percent_52',
'diff_match_win_percent_grass_60',
'diff_games_win_percent_grass_60',
'diff_5_set_match_win_percent_grass_60',
'diff_close_sets_percent_grass_60',
'diff_match_win_percent_hh',
'diff_games_win_percent_hh',
'diff_match_win_percent_grass_hh',
'diff_games_win_percent_grass_hh']
target = df.outcome #Labels
features = df[features_list] # Features
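seaborn is imported above mainly for styling; as an optional sanity check (my addition, not part of the original analysis), the engineered difference features can be eyeballed for redundancy with a correlation heatmap:

# Optional: visualize pairwise correlations between the difference features
plt.figure(figsize=(10, 8))
sns.heatmap(features.corr(), cmap='coolwarm', center=0)
plt.title('Feature correlations')
plt.show()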
# Split the data into training and test sets: train (80%) / test (20%)
train_features, test_features, train_target, test_target = train_test_split(features, target, test_size=0.20, random_state=1)
# Use XGBRegressor so the output is a continuous score that can be read as a win probability
xgbc = XGBRegressor()
#xgbc = XGBClassifier()
xgbc.fit(train_features, train_target)
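The commented-out XGBClassifier is the more conventional choice for a 0/1 label. As a minimal sketch of that route, predict_proba yields the class-1 probability directly instead of thresholding a regressor's output as done below:

# Alternative: a classifier gives class probabilities directly
clf = XGBClassifier()
clf.fit(train_features, train_target)
proba_1 = clf.predict_proba(test_features)[:, 1]  # probability that outcome == 1
pred = (proba_1 > 0.5).astype(int)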
Step 3: Analyze the Prediction Results
xgbc.score(train_features, train_target)  # R^2 on the training set: 0.5595579716862888
possibility = xgbc.predict(test_features)
prediction = possibility > 0.5
prediction = pd.Series(prediction)
prediction = prediction.astype('int')
results=pd.DataFrame({'prediction':prediction.values,'label':test_target.values})
columns = ['Round', 'player_0', 'player_1', 'outcome']
info = df.iloc[test_features.index][columns]
info.head()
Note the meaning of possibility:
- possibility is the likelihood that the answer is 1
- (1 - possibility) is the likelihood that the answer is 0. For rows predicted as 0 (prediction == 0) we replace possibility with (1 - possibility), so that possibility becomes a confidence index: the closer it is to 1, the more confident the prediction.
Below we analyze this confidence level (possibility).

info['prediction'] = prediction.values
info['possibility'] = possibility
# Flip possibility for rows predicted as 0, so it measures confidence in the prediction
info.loc[info['prediction'] == 0, 'possibility'] = 1 - info.loc[info['prediction'] == 0, 'possibility']

# Overall accuracy
info_pos = (info['outcome'] == info['prediction']).sum() / len(info['outcome'])
print('Accuracy:', info_pos)
# Accuracy on outcome == 0
info_0 = info[info['outcome'] == 0]
info_0_pos = (info_0['outcome'] == info_0['prediction']).sum() / len(info_0['outcome'])
print('Accuracy for outcome==0:', info_0_pos)
# Accuracy on outcome == 1
info_1 = info[info['outcome'] == 1]
info_1_pos = (info_1['outcome'] == info_1['prediction']).sum() / len(info_1['outcome'])
print('Accuracy for outcome==1:', info_1_pos)

Accuracy: 0.740909090909091
Accuracy for outcome==0: 0.8848484848484849
Accuracy for outcome==1: 0.3090909090909091

From these results, the model is good at predicting outcome == 0, but its accuracy on outcome == 1 is below chance level. In other words, the model has mostly learned that guessing 0 is the safer bet.
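The per-class accuracies above are just the row-normalized diagonal of a confusion matrix; for reference, the same breakdown can be produced with scikit-learn:

from sklearn.metrics import confusion_matrix

# Rows are true outcomes (0/1), columns are predictions (0/1)
print(confusion_matrix(info['outcome'], info['prediction']))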
For comparison, recall the earlier CNN+RNN model's results on Wimbledon:
Accuracy: 0.7454545454545455
Accuracy for outcome==0: 0.9272727272727272
Accuracy for outcome==1: 0.2

I found that XGBoost's accuracy is roughly on par with the deep-learning result.
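Both models under-predict outcome == 1. If the classes are in fact imbalanced, one standard XGBoost remedy is the scale_pos_weight parameter; a minimal sketch (my suggestion, not something tried in this analysis):

# Up-weight the positive class in proportion to the negative/positive ratio
neg = (train_target == 0).sum()
pos = (train_target == 1).sum()
clf_balanced = XGBClassifier(scale_pos_weight=neg / pos)
clf_balanced.fit(train_features, train_target)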
Next, we check whether a higher possibility (confidence) actually corresponds to a higher accuracy.
# For outcome == 0
info_0 = info[info['outcome'] == 0]
bins = np.arange(0.5, 1.01, 0.05)
possibility_group = pd.cut(info_0['possibility'], bins=bins)
df_group_0 = info_0.groupby(possibility_group).mean()
df_group_num = info_0.groupby(possibility_group).count()
g_num = df_group_num.iloc[:, 0]
df_group_0['# of samples'] = g_num
df_group_0 = df_group_0[['prediction', '# of samples']]

# For outcome == 1
info_1 = info[info['outcome'] == 1]
possibility_group = pd.cut(info_1['possibility'], bins=bins)
df_group_1 = info_1.groupby(possibility_group).mean()
df_group_num = info_1.groupby(possibility_group).count()
g_num = df_group_num.iloc[:, 0]
df_group_1['# of samples'] = g_num
df_group_1 = df_group_1[['prediction', '# of samples']]

From the two charts below we can see:
- For samples with label == 0, higher confidence means the mean prediction is closer to 0 (the (0.9, 0.95] bin holds only one sample, so it is less reliable), i.e. higher confidence does come with higher accuracy.
- For samples with label == 1, however, higher confidence does not pull the mean prediction toward 1; it actually drifts lower. (The x-axis is the confidence level; higher means more confidence in the prediction.) This again suggests the model learned that predicting 0 incurs the smaller loss, so it leans toward guessing 0 no matter what.
# Samples with outcome == 0
fig, axis = plt.subplots(2, 1, figsize=(9, 5), sharex=True)
df_group_0['prediction'].plot.bar(ax=axis[0])
axis[0].set_ylabel('prediction')
df_group_0['# of samples'].plot.bar(ax=axis[1])
axis[1].set_xlabel('possibility group')
axis[1].set_ylabel('# of samples')
plt.xticks(rotation=90);

# Samples with outcome == 1
fig, axis = plt.subplots(2, 1, figsize=(9, 5), sharex=True)
df_group_1['prediction'].plot.bar(ax=axis[0])
axis[0].set_ylabel('prediction')
df_group_1['# of samples'].plot.bar(ax=axis[1])
axis[1].set_xlabel('possibility group')
axis[1].set_ylabel('# of samples')
plt.xticks(rotation=90);

Step 4: Predict the 2019 Results
df_2019=pd.read_csv('Wimbledon2019.csv',sep=';') # 2019 match schedule and player information
df_2019_features=pd.read_csv('data/wimbledon_matches_with_feature_2019.csv') # Feature data for the 2019 matches
# Predict the outcomes
xgbc_predicY = xgbc.predict(df_2019_features)
# The closer the predicted value is to 0, the more likely player 0 wins; the closer to 1, the more likely player 1 wins
df_2019['probability'] = xgbc_predicY.flatten()
df_2019['prediction'] = df_2019['probability'].round().astype(int)
# If the predicted value > 0.5, player 1 is predicted to win, and (probability) is their win probability; for player-0 predictions, flip it to 1 - probability
df_2019["probability"] = np.where(df_2019["prediction"]==0, 1-df_2019["probability"], df_2019["probability"])
# Finally, express the prediction as the predicted winner's name
df_2019['prediction_winner']=np.where(df_2019['prediction']==0,df_2019['player_0'],df_2019['player_1'])
del df_2019['prediction']
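Finally, a quick way to inspect the resulting table (player_0 and player_1 are the column names used above):

# Show each match with the predicted winner and the model's confidence
print(df_2019[['player_0', 'player_1', 'prediction_winner', 'probability']].head())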