2019年7月13日 星期六

[資料回歸分析] 製作溫布敦網球賽預測用的訓練樣本與2019年比賽的測試樣本

from datetime import datetime
from datetime import timedelta
import pandas as pd
from datetime import datetime

#取得排名較高的球員姓名
def get_player_1_name(winner_name, winner_rank, loser_name, loser_rank):
    """
    :param winner_name: Name of winner
    :param winner_rank: Rank of the winner
    :param loser_name: Name of loser
    :param loser_rank: Rank of the loser
    :return: name of higher ranked player
    """
    if winner_rank < loser_rank:
        return winner_name
    else:
        return loser_name
#取得排名較高的球員名次
def get_player_1_rank(winner_rank, loser_rank):
    """
    :param winner_rank: Rank of the winner
    :param loser_rank: Rank of the loser
    :return: rank of higher ranked player
    """
    if winner_rank < loser_rank:
        return winner_rank
    else:
        return loser_rank
#取得排名較低的球員姓名
def get_player_2_name(winner_name, winner_rank, loser_name, loser_rank):
    """
    :param winner_name: Name of winner
    :param winner_rank: Rank of the winner
    :param loser_name: Name of loser
    :param loser_rank: Rank of the loser
    :return: name of lower ranked player
    """
    if winner_rank > loser_rank:
        return winner_name
    else:
        return loser_name
#取得排名較低的球員名次
def get_player_2_rank(winner_rank, loser_rank):
    """
    :param winner_rank: Rank of the winner
    :param loser_rank: Rank of the loser
    :return: rank of lower ranked player
    """
    if winner_rank > loser_rank:
        return winner_rank
    else:
        return loser_rank
#取得結果: 若排明較高的贏則outcome=0,若排名較低的贏則outcome=1
def outcome(winner_rank, loser_rank):
    """
    Returns 0 if higher ranked player won and 0 otherwise
    :param winner_rank: Rank of the winner
    :param loser_rank: Rank of the loser
    :return: Odds of the Higher ranked player
    """
    if winner_rank< loser_rank:
        return 0
    else:
        return 1
#取得排名較高的贏球機率
def get_player_1_odd(winner_odd, winner_rank, loser_odd, loser_rank):
    """
    Returns the odds of Higher ranked player
    :param winner_odd: Odds of the winner
    :param winner_rank: Rank of the winner
    :param loser_odd: Odds of the Loser
    :param loser_rank: Rank of the Loser
    :return: Odds of the Higher ranked player
    """
    if winner_rank < loser_rank:
        return winner_odd
    else:
        return loser_odd
#取得排名較低的贏球機率
def get_player_2_odd(winner_odd, winner_rank, loser_odd, loser_rank):
    """
    Returns the odds of Lower ranked player
    :param winner_odd:
    :param winner_rank:
    :param loser_odd:
    :param loser_rank:
    :return:
    """
    if winner_rank > loser_rank:
        return winner_odd
    else:
        return loser_odd

#取得date_string前num_days日的的日期
def subtract_days(date_string, num_days):
    """
    Subtract n days from a specified date
    :param date_string: pass date in format '%Y/%m/%d'
    :param num_days: Number of days to be subtracting from the date
    :return: date in format '%Y/%m/%d'
    """
    date_temp = (datetime.strptime(date_string, '%Y/%m/%d') - timedelta(days=num_days))
    return date_temp.strftime("%Y/%m/%d")

#取得球員的統計資料
def winning_percentage(player_id, data,  type1='matches', current_date=None, surface='All', last_n_weeks=0):
    """
    Caculate different player stats
    :param player_id: Name or ID of Player
    :param data: The raw dataframe from http://www.tennis-data.co.uk/alldata.php
    :param type1: Options: ['matches', 'total_matches', 'games', 'matches_5_sets', 'win_or_close_sets']
    :param current_date: Date of match
    :param surface: Surface options: ['All', 'Grass', 'Hard', 'Clay']
    :param last_n_weeks: Get stats from the past n weeks
    :return: Returns the players Stat for the specified parameters.
    """
    data = data[data['Date'] < current_date]

    if surface!='All':
        data = data[data['Surface'] == surface]

    if last_n_weeks>0:
        last_date = subtract_days(current_date, (last_n_weeks * 7))
        data = data[data['Date'] >= last_date]

    if type1 == 'matches':
        wins = (data['Winner'] == player_id).sum()
        loses = (data['Loser'] == player_id).sum()

    elif type1 == 'total_matches':
        return (data['Winner'] == player_id).sum() + (data['Loser'] == player_id).sum()


    elif type1 == 'matches_5_sets':
        wins = ((data['Winner'] == player_id) & (data['best_of_5'] == 1)).sum()
        loses = ((data['Loser'] == player_id) & (data['best_of_5'] == 1)).sum()


    elif type1 == 'games':
        winner_set_list = ['W1', 'W2', 'W3', 'W4', 'W5']
        loser_set_list = ['L1', 'L2', 'L3', 'L4', 'L5']

        wins = data[data['Winner'] == player_id][winner_set_list].values.sum() + data[data['Loser'] == player_id][loser_set_list].values.sum()
        loses = data[data['Loser'] == player_id][winner_set_list].values.sum() + data[data['Winner'] == player_id][loser_set_list].values.sum()


    elif type1 == 'win_or_close_sets':

        wins = 0
        loses = 0

        data_3_set = data[data['best_of_5'] == 0]
        data_5_set = data[data['best_of_5'] == 1]

        for i in range(1, 4):
            wins = wins + ((data_3_set['Winner'] == player_id) & (data_3_set[('W' + str(i))] >= 5)).sum()
            wins = wins + ((data_3_set['Loser'] == player_id) & (data_3_set[('L' + str(i))] >= 5)).sum()
            loses = loses + ((data_3_set['Winner'] == player_id) & (data_3_set[('W' + str(i))] < 5)).sum()
            loses = loses + ((data_3_set['Loser'] == player_id) & (data_3_set[('L' + str(i))] < 5)).sum()

        for i in range(1, 6):
            wins = wins + ((data_5_set['Winner'] == player_id) & (data_5_set[('W' + str(i))] >= 5)).sum()
            wins = wins + ((data_5_set['Loser'] == player_id) & (data_5_set[('L' + str(i))] >= 5)).sum()
            loses = loses + ((data_5_set['Winner'] == player_id) & (data_5_set[('W' + str(i))] < 5)).sum()
            loses = loses + ((data_5_set['Loser'] == player_id) & (data_5_set[('L' + str(i))] < 5)).sum()

    total = wins + loses

    if total <2:
        win_percent = 0

    else:
        win_percent = wins / total
    return win_percent

#取得一對一的贏球機率
def winning_percent_hh(player_name, opponent_name, data, type1='matches', current_date=None, surface='All', last_n_weeks=0):
    """
    :param player_name: Name of player
    :param opponent_name: Name of opponent
    :param data: The raw dataframe from http://www.tennis-data.co.uk/alldata.php
    :param type1: Options: ['matches', 'games', ]
    :param current_date: Date of the match
    :param surface: Surface options: ['All', 'Grass', 'Hard', 'Clay']
    :param last_n_weeks: Get stats from the past n weeks
    :return: Returns the players Head to Head Stat.
    """
    data = data[data['Date'] < current_date]

    if surface!='All':
        data = data[data['Surface'] == surface]

    if last_n_weeks>0:
        last_date = subtract_days(current_date, (last_n_weeks * 7))
        data = data[data['Date'] >= last_date]

    if type1 == 'matches':
        wins = ((data['Winner'] == player_name) & (data['Loser'] == opponent_name)).sum()
        loses = ((data['Winner'] == opponent_name) & (data['Loser'] == player_name)).sum()

    elif type1 == 'games':
        winner_set_list = ['W1', 'W2', 'W3', 'W4', 'W5']
        loser_set_list = ['L1', 'L2', 'L3', 'L4', 'L5']

        wins = data[(data['Winner'] == player_name) & (data['Loser'] == opponent_name)][winner_set_list].values.sum() + \
               data[(data['Winner'] == opponent_name) & (data['Loser'] == player_name)][loser_set_list].values.sum()

        loses = data[(data['Winner'] == opponent_name) & (data['Loser'] == player_name)][winner_set_list].values.sum() + \
                data[(data['Winner'] == player_name) & (data['Loser'] == opponent_name)][loser_set_list].values.sum()

    total = wins + loses

    if total == 0:
        win_percent = 0

    else:
        win_percent = wins / total
    return win_percent

def create_features(df_combined, df):
    """
    :param df_combined: All matched with player_0 as higher ranked player
    :param df: The raw dataframe from http://www.tennis-data.co.uk/alldata.php
    :return: A data_frame with player features for each match
    """

    # **************************************
    # Player Career Stats All Surface
    # **************************************
    print('Creating Player Career Stats All Surface')

    df_combined.loc[:, 'player_0_match_win_percent'] = df_combined.apply(
        lambda row: winning_percentage(row['player_0'], df, type1='matches', current_date=row['Date'], last_n_weeks=0),
        axis=1)
    df_combined.loc[:, 'player_1_match_win_percent'] = df_combined.apply(
        lambda row: winning_percentage(row['player_1'], df, type1='matches', current_date=row['Date'], last_n_weeks=0),
        axis=1)

    df_combined.loc[:, 'player_0_games_win_percent'] = df_combined.apply(
        lambda row: winning_percentage(row['player_0'], df, type1='games', current_date=row['Date'], last_n_weeks=0),
        axis=1)
    df_combined.loc[:, 'player_1_games_win_percent'] = df_combined.apply(
        lambda row: winning_percentage(row['player_1'], df, type1='games', current_date=row['Date'], last_n_weeks=0),
        axis=1)

    df_combined.loc[:, 'player_0_5_set_match_win_percent'] = df_combined.apply(
        lambda row: winning_percentage(row['player_0'], df, type1='matches_5_sets', current_date=row['Date'],
                                       last_n_weeks=0), axis=1)
    df_combined.loc[:, 'player_1_5_set_match_win_percent'] = df_combined.apply(
        lambda row: winning_percentage(row['player_1'], df, type1='matches_5_sets', current_date=row['Date'],
                                       last_n_weeks=0), axis=1)

    df_combined.loc[:, 'player_0_close_sets_percent'] = df_combined.apply(
        lambda row: winning_percentage(row['player_0'], df, type1='win_or_close_sets', current_date=row['Date'],
                                       last_n_weeks=0), axis=1)
    df_combined.loc[:, 'player_1_close_sets_percent'] = df_combined.apply(
        lambda row: winning_percentage(row['player_1'], df, type1='win_or_close_sets', current_date=row['Date'],
                                       last_n_weeks=0), axis=1)

    # **************************************
    # Player Career Stats on Grass/Clay/Hard
    # **************************************

    print('Creating Player Career Stats on Grass/Clay/Hard')

    df_combined.loc[:, 'player_0_match_win_percent_grass'] = df_combined.apply(
        lambda row: winning_percentage(row['player_0'], df, type1='matches', current_date=row['Date'],
                                       surface=row['Surface'], last_n_weeks=0), axis=1)
    df_combined.loc[:, 'player_1_match_win_percent_grass'] = df_combined.apply(
        lambda row: winning_percentage(row['player_1'], df, type1='matches', current_date=row['Date'],
                                       surface=row['Surface'], last_n_weeks=0), axis=1)

    df_combined.loc[:, 'player_0_games_win_percent_grass'] = df_combined.apply(
        lambda row: winning_percentage(row['player_0'], df, type1='games', current_date=row['Date'],
                                       surface=row['Surface'], last_n_weeks=0), axis=1)
    df_combined.loc[:, 'player_1_games_win_percent_grass'] = df_combined.apply(
        lambda row: winning_percentage(row['player_1'], df, type1='games', current_date=row['Date'],
                                       surface=row['Surface'], last_n_weeks=0), axis=1)

    df_combined.loc[:, 'player_0_5_set_match_win_percent_grass'] = df_combined.apply(
        lambda row: winning_percentage(row['player_0'], df, type1='matches_5_sets', current_date=row['Date'],
                                       surface=row['Surface'], last_n_weeks=0), axis=1)
    df_combined.loc[:, 'player_1_5_set_match_win_percent_grass'] = df_combined.apply(
        lambda row: winning_percentage(row['player_1'], df, type1='matches_5_sets', current_date=row['Date'],
                                       surface=row['Surface'], last_n_weeks=0), axis=1)

    df_combined.loc[:, 'player_0_close_sets_percent_grass'] = df_combined.apply(
        lambda row: winning_percentage(row['player_0'], df, type1='win_or_close_sets', current_date=row['Date'],
                                       surface=row['Surface'], last_n_weeks=0), axis=1)
    df_combined.loc[:, 'player_1_close_sets_percent_grass'] = df_combined.apply(
        lambda row: winning_percentage(row['player_1'], df, type1='win_or_close_sets', current_date=row['Date'],
                                       surface=row['Surface'], last_n_weeks=0), axis=1)

    # **************************************
    # Player Career Stats All Surface Last 52 Weeks
    # **************************************

    print('Creating Player Career Stats All Surface Last 52 Weeks')

    df_combined.loc[:, 'player_0_match_win_percent_52'] = df_combined.apply(
        lambda row: winning_percentage(row['player_0'], df, type1='matches', current_date=row['Date'], last_n_weeks=52),
        axis=1)
    df_combined.loc[:, 'player_1_match_win_percent_52'] = df_combined.apply(
        lambda row: winning_percentage(row['player_1'], df, type1='matches', current_date=row['Date'], last_n_weeks=52),
        axis=1)

    df_combined.loc[:, 'player_0_games_win_percent_52'] = df_combined.apply(
        lambda row: winning_percentage(row['player_0'], df, type1='games', current_date=row['Date'], last_n_weeks=52),
        axis=1)
    df_combined.loc[:, 'player_1_games_win_percent_52'] = df_combined.apply(
        lambda row: winning_percentage(row['player_1'], df, type1='games', current_date=row['Date'], last_n_weeks=52),
        axis=1)

    df_combined.loc[:, 'player_0_5_set_match_win_percent_52'] = df_combined.apply(
        lambda row: winning_percentage(row['player_0'], df, type1='matches_5_sets', current_date=row['Date'],
                                       last_n_weeks=52), axis=1)
    df_combined.loc[:, 'player_1_5_set_match_win_percent_52'] = df_combined.apply(
        lambda row: winning_percentage(row['player_1'], df, type1='matches_5_sets', current_date=row['Date'],
                                       last_n_weeks=52), axis=1)

    df_combined.loc[:, 'player_0_close_sets_percent_52'] = df_combined.apply(
        lambda row: winning_percentage(row['player_0'], df, type1='win_or_close_sets', current_date=row['Date'],
                                       last_n_weeks=52), axis=1)
    df_combined.loc[:, 'player_1_close_sets_percent_52'] = df_combined.apply(
        lambda row: winning_percentage(row['player_1'], df, type1='win_or_close_sets', current_date=row['Date'],
                                       last_n_weeks=52), axis=1)

    # **************************************
    # Player Career Stats on Grass/Clay/Hard Last 60 Weeks
    # **************************************

    print('Creating Player Career Stats on Grass/Clay/Hard Last 60 Weeks')

    df_combined.loc[:, 'player_0_match_win_percent_grass_60'] = df_combined.apply(
        lambda row: winning_percentage(row['player_0'], df, type1='matches', current_date=row['Date'],
                                       surface=row['Surface'], last_n_weeks=60), axis=1)
    df_combined.loc[:, 'player_1_match_win_percent_grass_60'] = df_combined.apply(
        lambda row: winning_percentage(row['player_1'], df, type1='matches', current_date=row['Date'],
                                       surface=row['Surface'], last_n_weeks=60), axis=1)

    df_combined.loc[:, 'player_0_games_win_percent_grass_60'] = df_combined.apply(
        lambda row: winning_percentage(row['player_0'], df, type1='games', current_date=row['Date'],
                                       surface=row['Surface'], last_n_weeks=60), axis=1)
    df_combined.loc[:, 'player_1_games_win_percent_grass_60'] = df_combined.apply(
        lambda row: winning_percentage(row['player_1'], df, type1='games', current_date=row['Date'],
                                       surface=row['Surface'], last_n_weeks=60), axis=1)

    df_combined.loc[:, 'player_0_5_set_match_win_percent_grass_60'] = df_combined.apply(
        lambda row: winning_percentage(row['player_0'], df, type1='matches_5_sets', current_date=row['Date'],
                                       surface=row['Surface'], last_n_weeks=60), axis=1)
    df_combined.loc[:, 'player_1_5_set_match_win_percent_grass_60'] = df_combined.apply(
        lambda row: winning_percentage(row['player_1'], df, type1='matches_5_sets', current_date=row['Date'],
                                       surface=row['Surface'], last_n_weeks=60), axis=1)

    df_combined.loc[:, 'player_0_close_sets_percent_grass_60'] = df_combined.apply(
        lambda row: winning_percentage(row['player_0'], df, type1='win_or_close_sets', current_date=row['Date'],
                                       surface=row['Surface'], last_n_weeks=60), axis=1)
    df_combined.loc[:, 'player_1_close_sets_percent_grass_60'] = df_combined.apply(
        lambda row: winning_percentage(row['player_1'], df, type1='win_or_close_sets', current_date=row['Date'],
                                       surface=row['Surface'], last_n_weeks=60), axis=1)

    # **************************************
    # Player Head to Head Career Stats All Surface
    # **************************************

    print('Creating Player Head to Head Career Stats All Surface')

    df_combined.loc[:, 'player_0_match_win_percent_hh'] = df_combined.apply(
        lambda row: winning_percent_hh(row['player_0'], row['player_1'], df, type1='matches', current_date=row['Date'],
                                       last_n_weeks=0), axis=1)
    df_combined.loc[:, 'player_1_match_win_percent_hh'] = df_combined.apply(
        lambda row: winning_percent_hh(row['player_1'], row['player_0'], df, type1='matches', current_date=row['Date'],
                                       last_n_weeks=0), axis=1)

    df_combined.loc[:, 'player_0_games_win_percent_hh'] = df_combined.apply(
        lambda row: winning_percent_hh(row['player_0'], row['player_1'], df, type1='games', current_date=row['Date'],
                                       last_n_weeks=0), axis=1)
    df_combined.loc[:, 'player_1_games_win_percent_hh'] = df_combined.apply(
        lambda row: winning_percent_hh(row['player_1'], row['player_0'], df, type1='games', current_date=row['Date'],
                                       last_n_weeks=0), axis=1)

    # **************************************
    # Player Head to Head Career Stats On Grass
    # **************************************

    print('Creating Player Head to Head Career Stats On Grass')

    df_combined.loc[:, 'player_0_match_win_percent_grass_hh'] = df_combined.apply(
        lambda row: winning_percent_hh(row['player_0'], row['player_1'], df, type1='matches', current_date=row['Date'],
                                       last_n_weeks=0), axis=1)
    df_combined.loc[:, 'player_1_match_win_percent_grass_hh'] = df_combined.apply(
        lambda row: winning_percent_hh(row['player_1'], row['player_0'], df, type1='matches', current_date=row['Date'],
                                       last_n_weeks=0), axis=1)

    df_combined.loc[:, 'player_0_games_win_percent_grass_hh'] = df_combined.apply(
        lambda row: winning_percent_hh(row['player_0'], row['player_1'], df, type1='games', current_date=row['Date'],
                                       last_n_weeks=0), axis=1)
    df_combined.loc[:, 'player_1_games_win_percent_grass_hh'] = df_combined.apply(
        lambda row: winning_percent_hh(row['player_1'], row['player_0'], df, type1='games', current_date=row['Date'],
                                       last_n_weeks=0), axis=1)

    # **************************************
    # Difference variables
    # **************************************

    print('Creating Difference Variables')

    df_combined.loc[:, 'diff_match_win_percent'] = df_combined['player_0_match_win_percent'] - df_combined[
        'player_1_match_win_percent']
    df_combined.loc[:, 'diff_games_win_percent'] = df_combined['player_0_games_win_percent'] - df_combined[
        'player_1_games_win_percent']
    df_combined.loc[:, 'diff_5_set_match_win_percent'] = df_combined['player_0_5_set_match_win_percent'] - df_combined[
        'player_1_5_set_match_win_percent']
    df_combined.loc[:, 'diff_close_sets_percent'] = df_combined['player_0_close_sets_percent'] - df_combined[
        'player_1_close_sets_percent']

    df_combined.loc[:, 'diff_match_win_percent_grass'] = df_combined['player_0_match_win_percent_grass'] - df_combined[
        'player_1_match_win_percent_grass']
    df_combined.loc[:, 'diff_games_win_percent_grass'] = df_combined['player_0_games_win_percent_grass'] - df_combined[
        'player_1_games_win_percent_grass']
    df_combined.loc[:, 'diff_5_set_match_win_percent_grass'] = df_combined['player_0_5_set_match_win_percent_grass'] - \
                                                               df_combined['player_1_5_set_match_win_percent_grass']
    df_combined.loc[:, 'diff_close_sets_percent_grass'] = df_combined['player_0_close_sets_percent_grass'] - \
                                                          df_combined['player_1_close_sets_percent_grass']

    df_combined.loc[:, 'diff_match_win_percent_52'] = df_combined['player_0_match_win_percent_52'] - df_combined[
        'player_1_match_win_percent_52']
    df_combined.loc[:, 'diff_games_win_percent_52'] = df_combined['player_0_games_win_percent_52'] - df_combined[
        'player_1_games_win_percent_52']
    df_combined.loc[:, 'diff_5_set_match_win_percent_52'] = df_combined['player_0_5_set_match_win_percent_52'] - \
                                                            df_combined['player_1_5_set_match_win_percent_52']
    df_combined.loc[:, 'diff_close_sets_percent_52'] = df_combined['player_0_close_sets_percent_52'] - df_combined[
        'player_1_close_sets_percent_52']

    df_combined.loc[:, 'diff_match_win_percent_grass_60'] = df_combined['player_0_match_win_percent_grass_60'] - \
                                                            df_combined['player_1_match_win_percent_grass_60']
    df_combined.loc[:, 'diff_games_win_percent_grass_60'] = df_combined['player_0_games_win_percent_grass_60'] - \
                                                            df_combined['player_1_games_win_percent_grass_60']
    df_combined.loc[:, 'diff_5_set_match_win_percent_grass_60'] = df_combined[
                                                                      'player_0_5_set_match_win_percent_grass_60'] - \
                                                                  df_combined[
                                                                      'player_1_5_set_match_win_percent_grass_60']
    df_combined.loc[:, 'diff_close_sets_percent_grass_60'] = df_combined['player_0_close_sets_percent_grass_60'] - \
                                                             df_combined['player_1_close_sets_percent_grass_60']

    df_combined.loc[:, 'diff_match_win_percent_hh'] = df_combined['player_0_match_win_percent_hh'] - df_combined[
        'player_1_match_win_percent_hh']
    df_combined.loc[:, 'diff_games_win_percent_hh'] = df_combined['player_0_games_win_percent_hh'] - df_combined[
        'player_1_games_win_percent_hh']

    df_combined.loc[:, 'diff_match_win_percent_grass_hh'] = df_combined['player_0_match_win_percent_grass_hh'] - \
                                                            df_combined['player_1_match_win_percent_grass_hh']
    df_combined.loc[:, 'diff_games_win_percent_grass_hh'] = df_combined['player_0_games_win_percent_grass_hh'] - \
                                                            df_combined['player_1_games_win_percent_grass_hh']

    return df_combined


#讀取英國網球公司的資料(2012-2020年)http://www.tennis-data.co.uk/alldata.php 
df = pd.DataFrame()
for i in range(2012, 2020):
    if i <=2012:
        link = str(i) + '.xls'
    else:
        link =  str(i) + '.xlsx'
    df_temp = pd.read_excel(link)
    df = df.append(df_temp)
df = df.reset_index()
#將沒有資料的欄位刪除
df = df[df.Date.notnull()]
#將時間格式改為(年/月/日)
df['Date']=df.apply(lambda row:(row['Date'].strftime("%Y/%m/%d")), axis=1)
df.reset_index(inplace=True)
del (df['index'])
#只選比賽有完成的數據
df = df[df.Comment == 'Completed']
#增加'Best of_5'欄位
df.loc[:,'best_of_5'] = (df['Best of'] ==5).astype(int)
#將'W3'欄位的資料由文字改為數字, 若不能改為數字則為NaN
df['W3'] = pd.to_numeric(df['W3'], errors='coerce')
#將'L3'欄位的資料由文字改為數字, 若不能改為數字則為NaN
df['L3'] = pd.to_numeric(df['L3'], errors='coerce')
#將NaN欄位補0
df = df.fillna(0)
#顯示資料型態
print(df.dtypes)

level_0         int64
ATP             int64
AvgL          float64
AvgW          float64
B365L         float64
B365W         float64
Best of         int64
Comment        object
Court          object
Date           object
EXL           float64
EXW            object
L1            float64
L2            float64
L3            float64
L4            float64
L5            float64
LBL           float64
LBW           float64
LPts          float64
LRank         float64
Location       object
Loser          object
Lsets         float64
MaxL          float64
MaxW          float64
PSL           float64
PSW           float64
Round          object
SJL           float64
SJW           float64
Series         object
Surface        object
Tournament     object
W1            float64
W2            float64
W3            float64
W4            float64
W5            float64
WPts          float64
WRank         float64
Winner         object
Wsets         float64
best_of_5       int64
dtype: object


建立訓練用資料表

#將2012-2010年的每場比賽的['Tournament',  'Date', 'Surface', 'Round']欄位複製到df_combined資料表
df_combined = df[['Tournament',  'Date', 'Surface', 'Round']].copy()

#擴增資料表內容
#增加排名較高的球員名稱欄位'player_0'
df_combined.loc[:,'player_0'] = df.apply(lambda row: get_player_1_name(row['Winner'], row['WRank'], row['Loser'], row['LRank']), axis=1)
#增加排名較高的球員排名欄位'player_0_rank'
df_combined.loc[:,'player_0_rank'] = df.apply(lambda row: get_player_1_rank(row['WRank'], row['LRank']), axis=1)
#增加排名較高的球員勝率欄位'player_0_odd'
df_combined.loc[:,'player_0_odd'] = df.apply(lambda row: get_player_1_odd(row['B365W'], row['WRank'], row['B365L'], row['LRank']), axis=1)
df_combined.head()
#增加比賽結果'player_0_odd', outcome==0代表0號選手贏, outcome==1代表1好選手贏
df_combined.loc[:,'outcome'] = df.apply(lambda row: outcome(row['WRank'], row['LRank']), axis=1)
df_combined
#選取溫網的比賽資料
df_combined = df_combined[df_combined.Tournament == 'Wimbledon']
df_combined
#選取訓練資料的比賽日期
df_combined = df_combined[df_combined.Date > '2010/01/01']
df_combined
#使用create_features函數建立訓練資料表
df_combined = create_features(df_combined, df) #訓練資料表共有71個特徵欄位
#除訓練用資料表
df_combined.to_csv('data/wimbledon_matches_with_feature.csv', index=False)

建立2019年要預測的比賽資料表


#匯入2019年溫網男子單打的比賽資料
df_2019=pd.read_csv('Wimbledon2019.csv',sep=';')
df_2019.head()
Roundplayer_0player_0_rankplayer_1player_1_rank
0Round of 16Djokovic N.1Humbert U.66
1Round of 16Goffin D.23Verdasco F.37
2Round of 16Raonic M.17Pella G.26
3Round of 16Bautista Agut R.22Paire B.32
4Round of 16Querrey S.65Sandgren T.94
5Round of 16Nadal R.2Sousa J.69
6Round of 16Nishikori K.7Kukushkin M.58
7Round of 16Federer R.3Berrettini M.20
8QuarterDjokovic N.1Goffin D.23
9QuarterRaonic M.17Bautista Agut R.22
10QuarterNadal R.2Querrey S.65
11QuarterFederer R.3Nishikori K.7
12SemisDjokovic N.1Bautista Agut R.22
13SemisNadal R.2Federer R.3
14FinalsDjokovic N.1Federer R.3
15FinalsDjokovic N.1Nadal R.2
#擴增資料表
df_2019['Date'] = '2019/07/07'
df_2019['Surface'] = 'Grass'
df_2019['diff_rank'] = df_2019['player_0_rank'] - df_2019['player_1_rank']
#製作預測資料表
df_2019 = create_features(df_2019, df)
#選擇要使用的特徵值
features_list = [
 'diff_rank',
 'diff_match_win_percent',
 'diff_games_win_percent',
 'diff_5_set_match_win_percent',
 'diff_close_sets_percent',
 'diff_match_win_percent_grass',
 'diff_games_win_percent_grass',
 'diff_5_set_match_win_percent_grass',
 'diff_close_sets_percent_grass',
 'diff_match_win_percent_52',
 'diff_games_win_percent_52',
 'diff_5_set_match_win_percent_52',
 'diff_close_sets_percent_52',
 'diff_match_win_percent_grass_60',
 'diff_games_win_percent_grass_60',
 'diff_5_set_match_win_percent_grass_60',
 'diff_close_sets_percent_grass_60',
 'diff_match_win_percent_hh',
 'diff_games_win_percent_hh',
 'diff_match_win_percent_grass_hh',
 'diff_games_win_percent_grass_hh']
features_16 = df_2019[features_list]

#輸出2019年要測試的資料表
features_16.to_csv('data/wimbledon_matches_with_feature_2019.csv', index=False)

三倍槓桿和一倍槓桿的長期定期定額報酬率分析

  以下是中國,美國股票債卷的三倍槓桿和一倍槓桿ETF分析.可以發現,三倍槓桿在下跌時期的跌幅遠比一倍槓桿的多 .且從時間軸來看,三倍槓桿由於下跌力道較強,因此會把之前的漲幅都吃掉,所以對於長期上身的市場,例如美國科技股,由於上升時間遠比下跌時間長,所以持有TQQQ的長期回報率會...