필요한 라이브러리 불러오기
import pandas as pd
import numpy as np
import math
import stats
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit, StratifiedKFold,
cross_val_score
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler, MinMaxScaler
# ensemble
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, \
ExtraTreesClassifier, GradientBoostClassifier, VotingClassifier
# lightgbm
import lightgbm as lgb
# xgboost
import xgboost as xgb
# svc
from sklearn.svm import SVC
# knn
from sklearn.neighbors import KNeighborsClassifier
# tree
from sklearn.tree import DecisionTreeClassifier
# linear
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, \
precision_score, recall_score, roc_auc_score, confusion_matrix, classification_report
Python
복사
baseline 모델 & cross validation
def basic_model_evaluation(X: pd.DataFrame, y: pd.DataFrame, eval_metric: str, cv: int, random_state=42):
    """Run cross-validation for a fixed zoo of baseline classifiers and print
    the mean/std of `eval_metric` for each.

    Parameters
    ----------
    X, y : features and target.
    eval_metric : sklearn scoring string passed to cross_val_score.
    cv : number of CV folds.
    random_state : seed forwarded to every model that accepts one.
    """
    # BUG FIX: the original `def` line was missing its trailing colon.
    classifiers = [
        KNeighborsClassifier(7),  # k chosen arbitrarily by the original author
        SVC(probability=True, random_state=random_state),
        DecisionTreeClassifier(random_state=random_state),
        # BUG FIX: `AdaBoostingClassifier` is not a real sklearn class name.
        AdaBoostClassifier(random_state=random_state),
        GradientBoostingClassifier(random_state=random_state),
        ExtraTreesClassifier(random_state=random_state),
        lgb.LGBMClassifier(random_state=random_state),
        xgb.XGBClassifier(random_state=random_state),
        LogisticRegression(random_state=random_state),
    ]
    for model in classifiers:
        name = model.__class__.__name__
        try:
            cv_score = cross_val_score(model, X, y, scoring=eval_metric, cv=cv)
            # FIX: report the metric actually used instead of hardcoding "f1_score"
            print('model: {0}의 {1}(mean,std) : ({2:.4f},{3:.4f})'.format(
                name, eval_metric, cv_score.mean(), cv_score.std()))
        except Exception:  # FIX: narrow the bare `except:` so KeyboardInterrupt still propagates
            print('model: {0}의 {1} 측정 불가!!!'.format(name, eval_metric))
Python
복사
불균형을 반영한 cross validation 함수
# Inspect the class balance of the target column (the target is imbalanced,
# which motivates the custom-threshold CV function below).
# NOTE(review): `train` and `CFG` are defined elsewhere in the notebook — not visible here.
train[CFG.target].value_counts(normalize=True)
Python
복사
타겟 변수가 불균형이라는 점을 감안해서 predict_proba 확률이 0.36 이상이면 1로 예측하도록 조정하고, cross-validation 을 구하는 함수를 만든다.
def skf_fold_evaluation(X: pd.DataFrame, y, random_state=42, threshold=0.36):
    """10-fold stratified CV over a fixed model zoo, predicting the positive
    class whenever its predicted probability >= `threshold`, then print the
    mean accuracy / macro-F1 / precision / recall / ROC-AUC per model.

    The default threshold of 0.36 (instead of 0.5) compensates for target
    imbalance, per the surrounding notebook text.  It is now a parameter so
    callers can tune it; the default preserves the original behavior.
    """
    n_splits = 10
    skf = StratifiedKFold(n_splits=n_splits)
    classifiers = [
        KNeighborsClassifier(7),
        SVC(probability=True, random_state=random_state),
        DecisionTreeClassifier(random_state=random_state),
        # BUG FIX: `AdaBoostingClassifier` is not a real sklearn class name.
        AdaBoostClassifier(random_state=random_state),
        GradientBoostingClassifier(random_state=random_state),
        ExtraTreesClassifier(random_state=random_state),
        lgb.LGBMClassifier(random_state=random_state),
        xgb.XGBClassifier(random_state=random_state),
        LogisticRegression(random_state=random_state),
    ]
    for model in classifiers:
        score_list = np.zeros(n_splits)
        score_f1_list = np.zeros(n_splits)
        score_precision_list = np.zeros(n_splits)
        score_recall_list = np.zeros(n_splits)
        score_roc_auc_list = np.zeros(n_splits)
        for fold, (train_index, test_index) in enumerate(skf.split(X, y)):
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]
            model.fit(X_train, y_train)
            # BUG FIX: the original called model.predict(X_test)[:, 1], but
            # predict() returns a 1-D label array.  The stated intent is to
            # threshold the positive-class *probability*.
            model_pred = model.predict_proba(X_test)[:, 1] >= threshold
            score_list[fold] = accuracy_score(y_test, model_pred)
            # BUG FIX: the original stored the undefined name `fold_f1_accuracy`
            # (NameError); the computed f1_score is what belongs here.
            score_f1_list[fold] = f1_score(y_test, model_pred, average='macro')
            score_precision_list[fold] = precision_score(y_test, model_pred)
            score_recall_list[fold] = recall_score(y_test, model_pred)
            score_roc_auc_list[fold] = roc_auc_score(y_test, model_pred)
        print('=' * 20 + 'model:', model.__class__.__name__ + '=' * 20)
        print('(acc,f1,pre,recall,roc)=({0:.4f},{1:.4f},{2:.4f},{3:.4f},{4:.4f})'.format(
            score_list.mean(), score_f1_list.mean(), score_precision_list.mean(),
            score_recall_list.mean(), score_roc_auc_list.mean()))
Python
복사
기초 EDA 분석 함수
def data_info(df: pd.DataFrame) -> None:
    """Print a quick structural summary of *df*: column/row counts, shape,
    categorical vs numeric split, total missing values, and column names."""
    categorical = df.columns[df.dtypes == 'object']
    numerical = df.columns[df.dtypes != 'object']
    print('column 개수:', len(df.columns))
    print('데이터 개수:', len(df))
    print('데이터의 shape:', df.shape)
    print('categorical 변수의 개수:', len(categorical))
    print('categorical 컬럼:', categorical)
    print('numerical 변수의 개수:', len(numerical))
    print('결측치 개수:', df.isnull().sum().sum())
    print('=' * 50)
    print('컬럼 이름:', df.columns)
Python
복사
변수 개수, 변수 이름, 데이터 개수, shape, 범주형 변수 개수, 수치형 변수 개수, 결측치 개수
더 다양한 EDA 함수
# 범주형 컬럼 분포, 타겟 변수 비율 확인
def search_columns(df: pd.DataFrame, columns: str, target: str) -> None:
    """Print the value distribution of one categorical column and the mean of
    the (binary) target within each of its categories."""
    distribution = df[columns].value_counts(normalize=True).sort_values(ascending=True)
    per_category_target = df.groupby(columns)[target].mean()
    print('컬럼명:', columns)
    print('컬럼 분포 ', distribution)
    print('범주 별 target 비율:', per_category_target)
# 컬럼별 타겟 변수 분포 시각화
def search_target_ratio(df: pd.DataFrame, columns: str, target: str):
    """Return a row-normalized crosstab of `columns` vs `target` (with margins),
    rendered as a colour-gradient pandas Styler for notebook display.

    FIX: the original annotated the return type as ``None`` even though it
    returns the Styler — the annotation is corrected here.
    """
    table = pd.crosstab(df[columns], df[target], margins=True, normalize='index')
    # .style.background_gradient needs jinja2 (an optional pandas dependency)
    return table.style.background_gradient(cmap='summer_r')
# 연속형 변수에 대한 plot 그리기
def numerical_plot(df: pd.DataFrame, columns: list, nrows: int, ncols: int, row_size: int, col_size: int) -> None:
    """Draw an nrows x ncols grid of distribution plots (histogram + KDE), one
    per entry of `columns`.

    Fixes vs original:
    - sns.distplot is deprecated and removed in seaborn >= 0.14; histplot(kde=True)
      is the documented replacement.
    - guard against the grid having more cells than columns (was an IndexError).
    - title string was missing a separating space.
    """
    fig, axes = plt.subplots(nrows, ncols, figsize=(row_size, col_size))
    count = 0
    for i in range(nrows):
        for j in range(ncols):
            if count >= len(columns):
                # more grid cells than columns: blank the leftover axes
                axes[i][j].set_visible(False)
                continue
            sns.histplot(x=df[columns[count]], kde=True, ax=axes[i][j])
            axes[i][j].set_title(columns[count] + ' column distribution')
            axes[i][j].set_xlabel(columns[count])
            count += 1
def numerical_target_group_ratio(df:pd.DataFrame, columns:list, nrows:int, ncols:int, row_size:int, col_size:int) -> None:
    # Grid of bar plots: for each feature in `columns`, the percentage of each
    # target value within every level of that feature.
    # NOTE(review): `columns[-1]` is treated as the target column, so the last
    # list entry must be the target — confirm with callers.
    fig, axes = plt.subplots(nrows, ncols, figsize=(row_size,col_size))
    count = 0
    for i in range(nrows):
        for j in range(ncols):
            # % share of each target value within each level of columns[count]
            counts = df.groupby(columns[count])[columns[-1]].value_counts(normalize=True).rename('percentage').mul(100).reset_index()
            sns.barplot(x=columns[count], y='percentage', hue=columns[-1], data=counts, ax=axes[i][j])
            axes[i][j].set_title(columns[count] + 'distribution')
            axes[i][j].set_xlabel(columns[count])
            # NOTE(review): hard-coded tick positions look dataset-specific —
            # verify they suit every plotted column.
            axes[i][j].set_xticks([0,2.5,5,7.5,10])
            count += 1
# 상관계수 히트맵 그리기
def correlation_matrix(df: pd.DataFrame, numerical_columns: list, figsize: tuple) -> None:
    """Draw a Pearson-correlation heatmap over `numerical_columns`; cell values
    are annotated only when the matrix is small enough (<= 20 columns) to read.

    BUG FIX: the original called plt.title() *before* plt.figure(), which put
    the title on the previous (or an implicit) figure instead of the heatmap's.
    Also uses seaborn's documented `linewidths` parameter (was `linewidth`).
    """
    colormap = plt.cm.PuBu
    plt.figure(figsize=figsize)
    plt.title('Pearson Correlation of data')
    corr = df[numerical_columns].astype(float).corr()
    if len(numerical_columns) <= 20:
        sns.heatmap(corr, linewidths=0.1, vmax=1.0, square=True, cmap=colormap,
                    linecolor='white', annot=True, annot_kws={'size': 16})
    else:
        sns.heatmap(corr, linewidths=0.1, vmax=1.0, square=True, cmap=colormap)
def describe_matrix(df: pd.DataFrame) -> pd.DataFrame:
    """Return the standard descriptive-statistics table for *df*."""
    summary = df.describe()
    return summary
Python
복사
Data Preprocessing 및 Scaling
# position Obejct 컬럼 축소
def position_convert(x: str) -> str:
    """Collapse a detailed position code into one of four lines:
    FW (attack), MF (midfield), DF (defence) or GK (goalkeeper / anything else)."""
    line_by_codes = {
        'FW': ('ST', 'RW', 'CF', 'LW'),
        'MF': ('CDM', 'CAM', 'CM', 'RM', 'LM'),
        'DF': ('CB', 'RB', 'LB', 'RWB', 'LWB'),
    }
    for line, codes in line_by_codes.items():
        if x in codes:
            return line
    # 'GK' and any unrecognised code fall through to goalkeeper,
    # matching the original else-branch.
    return 'GK'
# 범주형 변수에 대하여 라벨인코딩 진행
def categorical_column_preprocessing(train_df:pd.DataFrame, test_df:pd.DataFrame, columns:list) -> (pd.DataFrame, pd.DataFrame):
    """Label-encode each categorical column in-place on both frames and return
    them as a (train, test) tuple.

    FIX: the original fit the encoder on train only, so any category that
    appears only in test raised ValueError at transform time.  Fitting on the
    union of train+test values is safe here (label encoding carries no target
    information, so this is not leakage) and yields identical encodings when
    the category sets match.
    """
    for column in columns:
        le = LabelEncoder()
        le.fit(pd.concat([train_df[column], test_df[column]], axis=0))
        # FIX: original print concatenated the column name with no separator
        print(column + ' label encoder classes', le.classes_)
        train_df[column] = le.transform(train_df[column])
        test_df[column] = le.transform(test_df[column])
    return (train_df, test_df)
# 연속형 변수에 대해 표준화 진행
def standard_scaler(train_df:pd.DataFrame, test_df:pd.DataFrame, categorical_columns, numeric_columns:list) -> (pd.DataFrame, pd.DataFrame):
    """Z-score-standardize the numeric columns (scaler fit on train only — no
    test leakage) and re-attach the categorical columns unchanged.

    Returns a (train, test) tuple of new DataFrames.

    BUG FIX: the scaled DataFrames are built with the source index.  The
    original used a fresh RangeIndex, so pd.concat misaligned rows (producing
    NaNs) whenever the input index was not the default 0..n-1.
    """
    scaler = StandardScaler()
    scaler.fit(train_df[numeric_columns])
    x_scaled = pd.DataFrame(scaler.transform(train_df[numeric_columns]),
                            columns=numeric_columns, index=train_df.index)
    x_test_scaled = pd.DataFrame(scaler.transform(test_df[numeric_columns]),
                                 columns=numeric_columns, index=test_df.index)
    train_data = pd.concat([train_df[categorical_columns], x_scaled], axis=1)
    test_data = pd.concat([test_df[categorical_columns], x_test_scaled], axis=1)
    return (train_data, test_data)
def minmax_scaler(train_df:pd.DataFrame, test_df:pd.DataFrame, categorical_columns, numeric_columns:list) -> (pd.DataFrame, pd.DataFrame):
    """Min-max-scale the numeric columns to [0, 1] (scaler fit on train only —
    no test leakage) and re-attach the categorical columns unchanged.

    Returns a (train, test) tuple of new DataFrames.

    BUG FIX: same index-alignment fix as standard_scaler — scaled frames carry
    the source index so pd.concat does not misalign (NaN-fill) rows when the
    input index is not the default 0..n-1.
    """
    scaler = MinMaxScaler()
    scaler.fit(train_df[numeric_columns])
    x_scaled = pd.DataFrame(scaler.transform(train_df[numeric_columns]),
                            columns=numeric_columns, index=train_df.index)
    x_test_scaled = pd.DataFrame(scaler.transform(test_df[numeric_columns]),
                                 columns=numeric_columns, index=test_df.index)
    train_data = pd.concat([train_df[categorical_columns], x_scaled], axis=1)
    test_data = pd.concat([test_df[categorical_columns], x_test_scaled], axis=1)
    return (train_data, test_data)
# 학습데이터와 테스트데이터 비율 나누기
def train_test(X:pd.DataFrame, y:pd.DataFrame, test_size:float, random_state=42) -> (pd.DataFrame,pd.DataFrame,pd.DataFrame,pd.DataFrame):
    """Stratified train/validation split; thin wrapper around
    sklearn.model_selection.train_test_split with stratify=y."""
    split = train_test_split(X, y, test_size=test_size,
                             random_state=random_state, stratify=y)
    X_train, X_valid, y_train, y_valid = split
    return X_train, X_valid, y_train, y_valid
Python
복사
파일 불러오기
불러올 때 필요없는 변수, 중복 모두 제거하고 index도 초기화한다
# Load the competition files.  The ID column is dropped everywhere; duplicate
# rows are removed from train only, and its index is reset so .iloc/.loc stay
# aligned after the row drops.
train_df = pd.read_csv('train.csv').drop('ID', axis=1).drop_duplicates().reset_index(drop=True)
test_df = pd.read_csv('test.csv').drop('ID', axis=1)
submission_df = pd.read_csv('sample_submission.csv')
Python
복사
파생변수 생성 및 전처리
# Collapse detailed position codes into FW/MF/DF/GK.
# BUG FIX: the original called .apply() but discarded the result, so the
# Position column was never actually converted — assign it back.
train_df['Position'] = train_df['Position'].apply(position_convert)
test_df['Position'] = test_df['Position'].apply(position_convert)
# Derived categorical features, built identically for train and test.
for frame in (train_df, test_df):
    # Age bucket: under-20 vs adult.
    frame['age_convert'] = frame['Age'].apply(lambda x: ('young' if x < 20 else 'adult'))
    # Height bucket (manual cut points; every height falls into 1-4,
    # so the initial 0 is only a placeholder).
    frame['Height_cat'] = 0
    frame.loc[frame['Height'] <= 176.0, 'Height_cat'] = 1
    frame.loc[(frame['Height'] > 176.0) & (frame['Height'] <= 180.0), 'Height_cat'] = 2
    frame.loc[(frame['Height'] > 180.0) & (frame['Height'] <= 185.75), 'Height_cat'] = 3
    frame.loc[frame['Height'] > 185.75, 'Height_cat'] = 4
    # Weight bucket (same pattern).
    frame['Weight_cat'] = 0
    frame.loc[frame['Weight'] <= 70.0, 'Weight_cat'] = 1
    frame.loc[(frame['Weight'] > 70.0) & (frame['Weight'] <= 75.0), 'Weight_cat'] = 2
    frame.loc[frame['Weight'] > 75.0, 'Weight_cat'] = 3
# Collapse multicollinear skill columns into averaged ability/rating features,
# applied identically to train and test.
overall = [ 'Crossing',
    'Finishing', 'HeadingAccuracy', 'ShortPassing', 'Volleys', 'Dribbling',
    'Curve', 'FKAccuracy', 'LongPassing', 'BallControl', 'Acceleration',
    'SprintSpeed', 'Agility', 'Reactions', 'Balance', 'ShotPower',
    'Jumping', 'Stamina', 'Strength', 'LongShots', 'Aggression',
    'Interceptions', 'Positioning', 'Vision', 'Penalties', 'Composure',
    'Marking', 'StandingTackle', 'SlidingTackle', 'GKDiving', 'GKHandling',
    'GKKicking', 'GKPositioning', 'GKReflexes', 'GKRating']
for frame in (train_df, test_df):
    # Positional ability means (explicit sums so NaNs propagate exactly as before).
    frame['GK_ability'] = (frame['GKRating'] + frame['GKHandling'] + frame['GKKicking'] + frame['GKPositioning'] + frame['GKReflexes']) / 5
    frame['FW_ability'] = (frame['ShotPower'] + frame['LongShots'] + frame['Aggression'] + frame['Volleys'] + frame['Finishing'] + frame['HeadingAccuracy'] + frame['Dribbling'] + frame['Curve'] + frame['Crossing']) / 9
    frame['MF_ability'] = (frame['Crossing'] + frame['ShortPassing'] + frame['Dribbling'] + frame['Curve'] + frame['LongPassing'] + frame['BallControl'] + frame['Positioning'] + frame['Vision'] + frame['Composure']) / 9
    frame['DF_ability'] = (frame['Marking'] + frame['StandingTackle'] + frame['SlidingTackle']) / 3
    # Mean positional ratings (must precede Maximum_Rating_Position below).
    frame['DFRating'] = (frame['LWBRating'] + frame['LBRating'] + frame['CBRating'] + frame['RBRating'] + frame['RWBRating']) / 5
    frame['MFRating'] = (frame['CAMRating'] + frame['LMRating'] + frame['CMRating'] + frame['RMRating'] + frame['CDMRating']) / 5
    frame['FWRating'] = (frame['STRating'] + frame['LWRating'] + frame['LFRating'] + frame['CFRating'] + frame['RFRating'] + frame['RWRating']) / 6
    # Best-rated line for each player.
    frame['Maximum_Rating_Position'] = frame[['DFRating', 'FWRating', 'GKRating', 'MFRating']].idxmax(axis=1)
    frame['Physical_ability'] = (frame['Acceleration'] + frame['SprintSpeed'] + frame['Agility'] + frame['Reactions'] + frame['Balance'] + frame['Jumping'] + frame['Stamina'] + frame['Strength']) / 8
    # Raw sum over the full skill list.
    frame['Overall_ability'] = frame[overall].sum(axis=1)
Python
복사