首页
Preview

奥运会铁人三项运动数据分析

概述

本文分析旨在确定奥林匹克奖牌获得者在国籍、年龄、身高等方面的特征是否存在模式。我利用统计分析来确定这些因素是否与奥林匹克奖牌得主存在显著相关性,并确定奥林匹克成功的最有预测性的特征。

数据详情

Kaggle 数据集:120 年奥运历史上的运动员和成绩

data = pd.read_csv('athlete_events.csv')
data.head(5)
data.describe()

让我们根据具体运动项目“三项全能”探索数据。

#Triathlon sport for all countries
triathlon = data.loc[(data["Sport"] == 'Triathlon')]
total_count = triathlon.groupby("Season")[["Team", "NOC", "Event", "City", "Sport"]].nunique().reset_index()
total_count

EDA 仪表板链接

地图

奖牌数

triathlon['Medal'].value_counts()
plt.figure(figsize=(6,4))
sns.countplot(x='Medal', data=triathlon)
plt.show()

# Medals for Top 20 countries in summer olympics
total_medals_by_country = triathlon[triathlon['Season']=='Summer'].groupby(['NOC','Medal'])['Sex'].count().reset_index()
total_medals_by_country=total_medals_by_country.pivot('NOC','Medal','Sex').fillna(0).sort_values(by='Gold',ascending=False).head(20)
#total_medals_by_country
total_medals_by_country.plot(kind='bar')
fig = plt.gcf()
fig.set_size_inches(10, 7)
plt.xticks(rotation=90)
plt.xlabel('Country')
plt.ylabel("Athelete count")
plt.title('Medals by Country- Summer Olympics ')
plt.show()

男子奖牌分析

男子获奖者的年龄分析

让我们看看哪些国家在男子比赛中获得了最高比例的奖牌。

##Ploting data for total males participated per year
plt.figure(figsize=(7,4))
plotm = sns.barplot(x="Year", y="Total Males Per Year", data=year_df)
##Ploting data for medal winners
plt.figure(figsize=(8,5))
plota = sns.barplot(x="Year", y="Medal Winners", data=year_df)

男子体重分析

mdf = triathlon[triathlon['Sex'] == 'M']
mmedals = medals[medals['Sex'] == 'M']
medalweight = mmedals['Weight'].value_counts()
weighttotal = mdf['Weight'].value_counts()
weightdt = pd.DataFrame(weighttotal)
weightdt.reset_index()
weightdt['Medal Winners'] = pd.Series(medalweight)
weightdt = weightdt.rename(index=str, columns={"Weight": "Total Males Per Weight"})
weightdt['Percentage'] = weight_df['Medal Winners'] / weight_df['Total Males Per Weight'] * 100
weightdt = weightdt.dropna(axis=0)
weightdt = weightdt.reset_index()
weightdt = weightdt.rename(index=str, columns={"index": "Weight"})
weightdt = weightdt.sort_values(by=['Medal Winners'], ascending=False)
weightdt.head()
# plot for data for percentage
plt.figure(figsize=(10,5))
plot_tm = sns.barplot(x="Weight", y="Total Males Per Weight", data=weightdt)

plt.figure(figsize=(10,5))
plot_mwm = sns.barplot(x="Weight", y="Medal Winners", data=weightdt)

plt.figure(figsize=(10,5))
plot_pctm = sns.barplot(x="Weight", y="Percentage", data=weightdt)

mxmedal_weight=medals_wgt_athlete.groupby(['Weight'])['Total Medals'].max().reset_index(name='Max Medals')
plt.figure(figsize=(10,5))
plot_t = sns.barplot(x="Weight", y= "Max Medals", data=mxmedal_weight)
mxmedal_weight.head()

体重在 70 到 75 之间的男子获得的奖牌最多。

男子身高分析

medal_ht = medals['Height'].value_counts()
ht_total = mdf['Height'].value_counts()
ht_total = pd.DataFrame(ht_total)
ht_total.reset_index()
ht_total['Medal Winners'] = pd.Series(medal_ht)
ht_total = ht_total.rename(index=str, columns={"Height": "Total Males"})
ht_total['Percentage'] = ht_total['Medal Winners'] / ht_total['Total Males'] * 100
ht_total = ht_total.dropna(axis=0)
ht_total = ht_total.reset_index()
ht_total = ht_total.rename(index=str, columns={"index": "Height"})
ht_total = ht_total.sort_values(by=['Medal Winners'], ascending=False)
print(ht_total.head())

##Ploting data for medal winners
plt.figure(figsize=(10,5))
plot_mwf = sns.barplot(x="Height", y="Medal Winners", data=ht_total)

女子奖牌分析

# Get the counts for female medal earner
fmedal = fmedals['Sex'].value_counts()

# Get the total counts for female overall for all competitors
ftotal = fdf['Sex'].value_counts()

# Create a new dataframe with the data from the total females
fdf = pd.DataFrame(ftotal)
fdf.reset_index()

# Add the data for the winners and calculate the percentages for each female
fdf['Medal Winners'] = pd.Series(fdf)
fdf = fdf.rename(index=str, columns={"Sex": "Total Females"})
fdf['Percentage'] = fdf['Medal Winners'] / fdf['Total Females'] * 100
fdf = fdf.dropna(axis=0)
fdf.index.names = ['Sex']
fdf.head()

女子获奖者的年龄分析

# Get the counts of each age for each female medal earner
fmedal_ages = fmedals['Age'].value_counts()

# Get the total counts for age overall for all female competitors
fage_total = f_df['Age'].value_counts()

# Create a new dataframe with the data from the total ages
fage_df = pd.DataFrame(fage_total)
fage_df.reset_index()

# Add the data for the winners and calculate the percentages for each age
fage_df['Winners'] = pd.Series(fage_df)
fage_df = fage_df.rename(index=str, columns={"Age": "Total"})
fage_df['Percentage'] = fage_df['Winners'] / fage_df['Total'] * 100
fage_df = fage_df.dropna(axis=0)
fage_df = fage_df.reset_index()
fage_df = fage_df.rename(index=str, columns={"index": "Age"})

# Convert all of the ages from floats to ints
fage_df['Age'] = pd.to_numeric(fage_df['Age'], errors='coerce')
fage_df['Age'] = fage_df['Age'].astype(int)

# Plot the data
plt.figure(figsize=(10,5))

女子体重分析

fmedal_wt = fmedals['Weight'].value_counts()
fwt_total = f_df['Weight'].value_counts()
fwt_df = pd.DataFrame(fwt_total)
fwt_df.reset_index()
fwt_df['Medal Winners'] = pd.Series(fwt_df)
fwt_df = fwt_df.rename(index=str, columns={"Weight": "Total Females Per Weight"})
fwt_df['Percentage'] = fwt_df['Medal Winners'] / fwt_df['Total Females Per Weight'] * 100
fwt_df = fwt_df.dropna(axis=0)
fwt_df = fwt_df.reset_index()
fwt_df = fwt_df.rename(index=str, columns={"index": "Weight"})
fwt_df = fwt_df.sort_values(by=['Medal Winners'], ascending=False)
print(fwt_df.head())

plt.figure(figsize=(10,5))
plot_tf = sns.barplot(x="Weight", y="Total Females Per Weight", data=fwt_df)

plt.figure(figsize=(10,5))
plot_mwf = sns.barplot(x="Weight", y="Medal Winners", data=fwt_df)

plt.figure(figsize=(10,5))
plot_pctf = sns.barplot(x="Weight", y="Percentage", data=fwt_df)

女子身高分析

fmedal_ht = fmedals['Height'].value_counts()
fht_total = f_df['Height'].value_counts()
fht_df = pd.DataFrame(fht_total)
fht_df.reset_index()
fht_df['Medal Winners'] = pd.Series(fht_df)
fht_df = f_height_df.rename(index=str, columns={"Height": "Total Females"})
fht_df['Percentage'] = fht_df['Medal Winners'] / fht_df['Total Females'] * 100
fht_df = fht_df.dropna(axis=0)
fht_df = fht_df.reset_index()
fht_df = fht_df.rename(index=str, columns={"index": "Height"})
fht_df = fht_df.sort_values(by=['Medal Winners'], ascending=False)
print(fht_df.head())

##Ploting data for medal winners
plt.figure(figsize=(10,5))
plot_mwf = sns.barplot(x="Height", y="Medal Winners", data=fht_df)

# 在最近的奥运会中获得金牌数量最多的国家

# Visulaizing the countries with most Gold medals
plt.figure(figsize=(12,5))
sns.barplot(x=max_year_country, y=max_year_country.index)
plt.xlabel("Top 20 Gold Medal Won Countrywise")

数据清洗

print("data shape",triathlon.shape)
print("\n")
print("Total null values in data:", triathlon.isnull().sum())
# Binary Encoding for Medal
triathlon['Medal'] = triathlon['Medal'].apply(lambda x: 1 if str(x) != 'nan' else 0)
# groupby for medal and gender then calculate mean for numerical features 
triathlon.groupby(['Medal', 'Sex']).mean().astype(np.int)

missing_values_columns = [col for col in triathlon.columns if triathlon.isnull().sum()[col] > 0]
missing_values_columns

# function for missing values substitution
def fill_missing_values(data,missing_values_columns):
    df = data.copy()
    for col in missing_values_columns:
        df[col] = df.groupby(['Medal', 'Sex'])[col].apply(lambda x: x.fillna(x.mean()).astype(np.int))
    return df

# lets use this function to fill the missing values
triathlon = fill_missing_values(triathlon,missing_values_columns)
for column in ['Age', 'Height', 'Weight']:
    triathlon[column] = triathlon.groupby(['Medal', 'Sex'])[column].apply(lambda x: x.fillna(x.mean()).astype(np.int))

# Checking null values again
print("Total missing values:", triathlon.isna().sum().sum())
Total missing values: 0
data['Sex'] = data['Sex'].replace('F', 1)
data['Sex'] = data['Sex'].replace('M', 0)
# Drop Uncessary feature columns
data = data.drop(['ID', 'Name', 'Games', 'Season'], axis=1)

特征选择

#Apply SelectKBest and extract top 10 features out of the data
best = SelectKBest(score_func=chi2, k=10)
fit = best.fit(X,y)
data_scores=pd.DataFrame(fit.scores_)
data_columns=pd.DataFrame(X.columns)
# Join the two dataframes
scores=pd.concat([data_columns,data_scores],axis=1)
scores.columns=['Feature','Score']
print(scores.nlargest(10,'Score'))
Feature     Score
53     T_Switzerland  9.595257
105            N_SUI  9.595257
6        T_Australia  7.257331
24   T_Great Britain  7.257331
61             N_AUS  7.257331
79             N_GBR  7.257331
52          T_Sweden  2.793604
107            N_SWE  2.793604
41     T_New Zealand  2.092209
98             N_NZL  2.092209

# Select 10 features
features=scores["Feature"].tolist()[:10]
features
['Sex',
 'Age',
 'Height',
 'Weight',
 'Year',
 'T_Argentina',
 'T_Australia',
 'T_Austria',
 'T_Azerbaijan',
 'T_Barbados']
#create new data frame for modeling
data=data[['Age','Sex','Height','Weight','Year','T_Argentina','T_Barbados',
           'T_Australia','T_Australia','T_Australia', 'T_United States','Medal']]
data.head()

y = data['Medal']
X = data.drop(['Medal'], axis=1)
#Split data into training and testing data
train_x, test_x, train_y, test_y = train_test_split(X, y, test_size=0.4, random_state=1)

# Scaling data 
scaler = MinMaxScaler()
train_x = scaler.fit_transform(train_x)
test_x = scaler.transform(test_x)

模型预测

1. 逻辑回归

algo1 = 'LogisticRegression'
lr = LogisticRegression(random_state=1, max_iter=1000)
lr.fit(train_x, train_y)
lr_predict = lr.predict(test_x)
lr_conf_matrix = confusion_matrix(test_y, lr_predict)
lr_acc_score = accuracy_score(test_y, lr_predict)
print("confusion matrix")
print(lr_conf_matrix)
print("\n")
print("Accuracy of Logistic Regression:",lr_acc_score*100,'\n')
print(classification_report(test_y,lr_predict))

结论与讨论

  • 从这个分析中,我们发现24岁左右的男性赢得了更多的奖牌,而女性在27至35岁之间赢得了最多的奖牌。
  • 根据我们的体重分析,体重在60至80磅之间的男性和女性更有可能赢得奖牌,但这可能是因为这个体重范围内有更多的奖牌可获得,更多的运动员在这个体重范围内比赛。
  • 身高和体重分析是最令人震惊的。传统上,我们认为体重更重、身高更高的运动员在身体能力方面更有优势,但我们的研究表明这是错误的。
  • 看来体重和身高平均的人更成功,这是有道理的,因为平均身高的人更多,但我们仍然惊讶于他们更大的对手没有统治他们。奥运会运动员的体重、身高和表现之间没有显著的相关性。运动员参加比赛的年份对他们是否获得奖牌没有显著影响。

参考资料:

译自:https://swayanshu.medium.com/olympics-data-analysis-for-triathlon-sport-6234fae0c40f

版权声明:本文内容由TeHub注册用户自发贡献,版权归原作者所有,TeHub社区不拥有其著作权,亦不承担相应法律责任。 如果您发现本社区中有涉嫌抄袭的内容,填写侵权投诉表单进行举报,一经查实,本社区将立刻删除涉嫌侵权内容。

点赞(0)
收藏(0)
alivne
复杂的问题简单化

评论(0)

添加评论