概述
本文分析旨在确定奥林匹克奖牌获得者在国籍、年龄、身高等方面的特征是否存在模式。我利用统计分析来确定这些因素是否与奥林匹克奖牌得主存在显著相关性,并确定奥林匹克成功的最有预测性的特征。
数据详情
Kaggle 数据集:120 年奥运历史上的运动员和成绩
data = pd.read_csv('athlete_events.csv')
data.head(5)
data.describe()
让我们根据具体运动项目“三项全能”探索数据。
#Triathlon sport for all countries
triathlon = data.loc[(data["Sport"] == 'Triathlon')]
total_count = triathlon.groupby("Season")[["Team", "NOC", "Event", "City", "Sport"]].nunique().reset_index()
total_count
奖牌数
triathlon['Medal'].value_counts()
plt.figure(figsize=(6,4))
sns.countplot(x='Medal', data=triathlon)
plt.show()
# Medals for Top 20 countries in summer olympics
total_medals_by_country = triathlon[triathlon['Season']=='Summer'].groupby(['NOC','Medal'])['Sex'].count().reset_index()
total_medals_by_country=total_medals_by_country.pivot('NOC','Medal','Sex').fillna(0).sort_values(by='Gold',ascending=False).head(20)
#total_medals_by_country
total_medals_by_country.plot(kind='bar')
fig = plt.gcf()
fig.set_size_inches(10, 7)
plt.xticks(rotation=90)
plt.xlabel('Country')
plt.ylabel("Athelete count")
plt.title('Medals by Country- Summer Olympics ')
plt.show()
男子奖牌分析
男子获奖者的年龄分析
让我们看看哪些国家在男子比赛中获得了最高比例的奖牌。
##Ploting data for total males participated per year
plt.figure(figsize=(7,4))
plotm = sns.barplot(x="Year", y="Total Males Per Year", data=year_df)
##Ploting data for medal winners
plt.figure(figsize=(8,5))
plota = sns.barplot(x="Year", y="Medal Winners", data=year_df)
男子体重分析
mdf = triathlon[triathlon['Sex'] == 'M']
mmedals = medals[medals['Sex'] == 'M']
medalweight = mmedals['Weight'].value_counts()
weighttotal = mdf['Weight'].value_counts()
weightdt = pd.DataFrame(weighttotal)
weightdt.reset_index()
weightdt['Medal Winners'] = pd.Series(medalweight)
weightdt = weightdt.rename(index=str, columns={"Weight": "Total Males Per Weight"})
weightdt['Percentage'] = weight_df['Medal Winners'] / weight_df['Total Males Per Weight'] * 100
weightdt = weightdt.dropna(axis=0)
weightdt = weightdt.reset_index()
weightdt = weightdt.rename(index=str, columns={"index": "Weight"})
weightdt = weightdt.sort_values(by=['Medal Winners'], ascending=False)
weightdt.head()
# plot for data for percentage
plt.figure(figsize=(10,5))
plot_tm = sns.barplot(x="Weight", y="Total Males Per Weight", data=weightdt)
plt.figure(figsize=(10,5))
plot_mwm = sns.barplot(x="Weight", y="Medal Winners", data=weightdt)
plt.figure(figsize=(10,5))
plot_pctm = sns.barplot(x="Weight", y="Percentage", data=weightdt)
mxmedal_weight=medals_wgt_athlete.groupby(['Weight'])['Total Medals'].max().reset_index(name='Max Medals')
plt.figure(figsize=(10,5))
plot_t = sns.barplot(x="Weight", y= "Max Medals", data=mxmedal_weight)
mxmedal_weight.head()
体重在 70 到 75 之间的男子获得的奖牌最多。
男子身高分析
medal_ht = medals['Height'].value_counts()
ht_total = mdf['Height'].value_counts()
ht_total = pd.DataFrame(ht_total)
ht_total.reset_index()
ht_total['Medal Winners'] = pd.Series(medal_ht)
ht_total = ht_total.rename(index=str, columns={"Height": "Total Males"})
ht_total['Percentage'] = ht_total['Medal Winners'] / ht_total['Total Males'] * 100
ht_total = ht_total.dropna(axis=0)
ht_total = ht_total.reset_index()
ht_total = ht_total.rename(index=str, columns={"index": "Height"})
ht_total = ht_total.sort_values(by=['Medal Winners'], ascending=False)
print(ht_total.head())
##Ploting data for medal winners
plt.figure(figsize=(10,5))
plot_mwf = sns.barplot(x="Height", y="Medal Winners", data=ht_total)
女子奖牌分析
# Get the counts for female medal earner
fmedal = fmedals['Sex'].value_counts()
# Get the total counts for female overall for all competitors
ftotal = fdf['Sex'].value_counts()
# Create a new dataframe with the data from the total females
fdf = pd.DataFrame(ftotal)
fdf.reset_index()
# Add the data for the winners and calculate the percentages for each female
fdf['Medal Winners'] = pd.Series(fdf)
fdf = fdf.rename(index=str, columns={"Sex": "Total Females"})
fdf['Percentage'] = fdf['Medal Winners'] / fdf['Total Females'] * 100
fdf = fdf.dropna(axis=0)
fdf.index.names = ['Sex']
fdf.head()
女子获奖者的年龄分析
# Get the counts of each age for each female medal earner
fmedal_ages = fmedals['Age'].value_counts()
# Get the total counts for age overall for all female competitors
fage_total = f_df['Age'].value_counts()
# Create a new dataframe with the data from the total ages
fage_df = pd.DataFrame(fage_total)
fage_df.reset_index()
# Add the data for the winners and calculate the percentages for each age
fage_df['Winners'] = pd.Series(fage_df)
fage_df = fage_df.rename(index=str, columns={"Age": "Total"})
fage_df['Percentage'] = fage_df['Winners'] / fage_df['Total'] * 100
fage_df = fage_df.dropna(axis=0)
fage_df = fage_df.reset_index()
fage_df = fage_df.rename(index=str, columns={"index": "Age"})
# Convert all of the ages from floats to ints
fage_df['Age'] = pd.to_numeric(fage_df['Age'], errors='coerce')
fage_df['Age'] = fage_df['Age'].astype(int)
# Plot the data
plt.figure(figsize=(10,5))
女子体重分析
fmedal_wt = fmedals['Weight'].value_counts()
fwt_total = f_df['Weight'].value_counts()
fwt_df = pd.DataFrame(fwt_total)
fwt_df.reset_index()
fwt_df['Medal Winners'] = pd.Series(fwt_df)
fwt_df = fwt_df.rename(index=str, columns={"Weight": "Total Females Per Weight"})
fwt_df['Percentage'] = fwt_df['Medal Winners'] / fwt_df['Total Females Per Weight'] * 100
fwt_df = fwt_df.dropna(axis=0)
fwt_df = fwt_df.reset_index()
fwt_df = fwt_df.rename(index=str, columns={"index": "Weight"})
fwt_df = fwt_df.sort_values(by=['Medal Winners'], ascending=False)
print(fwt_df.head())
plt.figure(figsize=(10,5))
plot_tf = sns.barplot(x="Weight", y="Total Females Per Weight", data=fwt_df)
plt.figure(figsize=(10,5))
plot_mwf = sns.barplot(x="Weight", y="Medal Winners", data=fwt_df)
plt.figure(figsize=(10,5))
plot_pctf = sns.barplot(x="Weight", y="Percentage", data=fwt_df)
女子身高分析
fmedal_ht = fmedals['Height'].value_counts()
fht_total = f_df['Height'].value_counts()
fht_df = pd.DataFrame(fht_total)
fht_df.reset_index()
fht_df['Medal Winners'] = pd.Series(fht_df)
fht_df = f_height_df.rename(index=str, columns={"Height": "Total Females"})
fht_df['Percentage'] = fht_df['Medal Winners'] / fht_df['Total Females'] * 100
fht_df = fht_df.dropna(axis=0)
fht_df = fht_df.reset_index()
fht_df = fht_df.rename(index=str, columns={"index": "Height"})
fht_df = fht_df.sort_values(by=['Medal Winners'], ascending=False)
print(fht_df.head())
##Ploting data for medal winners
plt.figure(figsize=(10,5))
plot_mwf = sns.barplot(x="Height", y="Medal Winners", data=fht_df)
# 在最近的奥运会中获得金牌数量最多的国家
# Visulaizing the countries with most Gold medals
plt.figure(figsize=(12,5))
sns.barplot(x=max_year_country, y=max_year_country.index)
plt.xlabel("Top 20 Gold Medal Won Countrywise")
数据清洗
print("data shape",triathlon.shape)
print("\n")
print("Total null values in data:", triathlon.isnull().sum())
# Binary Encoding for Medal
triathlon['Medal'] = triathlon['Medal'].apply(lambda x: 1 if str(x) != 'nan' else 0)
# groupby for medal and gender then calculate mean for numerical features
triathlon.groupby(['Medal', 'Sex']).mean().astype(np.int)
missing_values_columns = [col for col in triathlon.columns if triathlon.isnull().sum()[col] > 0]
missing_values_columns
# function for missing values substitution
def fill_missing_values(data,missing_values_columns):
df = data.copy()
for col in missing_values_columns:
df[col] = df.groupby(['Medal', 'Sex'])[col].apply(lambda x: x.fillna(x.mean()).astype(np.int))
return df
# lets use this function to fill the missing values
triathlon = fill_missing_values(triathlon,missing_values_columns)
for column in ['Age', 'Height', 'Weight']:
triathlon[column] = triathlon.groupby(['Medal', 'Sex'])[column].apply(lambda x: x.fillna(x.mean()).astype(np.int))
# Checking null values again
print("Total missing values:", triathlon.isna().sum().sum())
Total missing values: 0
data['Sex'] = data['Sex'].replace('F', 1)
data['Sex'] = data['Sex'].replace('M', 0)
# Drop Uncessary feature columns
data = data.drop(['ID', 'Name', 'Games', 'Season'], axis=1)
特征选择
#Apply SelectKBest and extract top 10 features out of the data
best = SelectKBest(score_func=chi2, k=10)
fit = best.fit(X,y)
data_scores=pd.DataFrame(fit.scores_)
data_columns=pd.DataFrame(X.columns)
# Join the two dataframes
scores=pd.concat([data_columns,data_scores],axis=1)
scores.columns=['Feature','Score']
print(scores.nlargest(10,'Score'))
Feature Score
53 T_Switzerland 9.595257
105 N_SUI 9.595257
6 T_Australia 7.257331
24 T_Great Britain 7.257331
61 N_AUS 7.257331
79 N_GBR 7.257331
52 T_Sweden 2.793604
107 N_SWE 2.793604
41 T_New Zealand 2.092209
98 N_NZL 2.092209
# Select 10 features
features=scores["Feature"].tolist()[:10]
features
['Sex',
'Age',
'Height',
'Weight',
'Year',
'T_Argentina',
'T_Australia',
'T_Austria',
'T_Azerbaijan',
'T_Barbados']
#create new data frame for modeling
data=data[['Age','Sex','Height','Weight','Year','T_Argentina','T_Barbados',
'T_Australia','T_Australia','T_Australia', 'T_United States','Medal']]
data.head()
y = data['Medal']
X = data.drop(['Medal'], axis=1)
#Split data into training and testing data
train_x, test_x, train_y, test_y = train_test_split(X, y, test_size=0.4, random_state=1)
# Scaling data
scaler = MinMaxScaler()
train_x = scaler.fit_transform(train_x)
test_x = scaler.transform(test_x)
模型预测
1. 逻辑回归
algo1 = 'LogisticRegression'
lr = LogisticRegression(random_state=1, max_iter=1000)
lr.fit(train_x, train_y)
lr_predict = lr.predict(test_x)
lr_conf_matrix = confusion_matrix(test_y, lr_predict)
lr_acc_score = accuracy_score(test_y, lr_predict)
print("confusion matrix")
print(lr_conf_matrix)
print("\n")
print("Accuracy of Logistic Regression:",lr_acc_score*100,'\n')
print(classification_report(test_y,lr_predict))
结论与讨论
- 从这个分析中,我们发现24岁左右的男性赢得了更多的奖牌,而女性在27至35岁之间赢得了最多的奖牌。
- 根据我们的体重分析,体重在60至80磅之间的男性和女性更有可能赢得奖牌,但这可能是因为这个体重范围内有更多的奖牌可获得,更多的运动员在这个体重范围内比赛。
- 身高和体重分析是最令人震惊的。传统上,我们认为体重更重、身高更高的运动员在身体能力方面更有优势,但我们的研究表明这是错误的。
- 看来体重和身高平均的人更成功,这是有道理的,因为平均身高的人更多,但我们仍然惊讶于他们更大的对手没有统治他们。奥运会运动员的体重、身高和表现之间没有显著的相关性。运动员参加比赛的年份对他们是否获得奖牌没有显著影响。
参考资料:
- https://www.kaggle.com/datasets/heesoo37/120-years-of-olympic-history-athletes-and-results
- https://www.tableau.com/data-insights/dashboard-showcase
- https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
- https://statisticsglobe.com/dataframe-manipulation-using-pandas-python
译自:https://swayanshu.medium.com/olympics-data-analysis-for-triathlon-sport-6234fae0c40f
评论(0)