Search

분류 연습 - 축구선수의 유망 여부 예측 AI 해커톤

필요한 라이브러리 불러오기

import pandas as pd import numpy as np import math import stats import seaborn as sns import matplotlib as mpl import matplotlib.pyplot as plt %matplotlib inline import warnings warnings.filterwarnings('ignore') from sklearn.model_selection import train_test_split, StratifiedShuffleSplit, StratifiedKFold, cross_val_score from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler, MinMaxScaler # ensemble from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, \ ExtraTreesClassifier, GradientBoostClassifier, VotingClassifier # lightgbm import lightgbm as lgb # xgboost import xgboost as xgb # svc from sklearn.svm import SVC # knn from sklearn.neighbors import KNeighborsClassifier # tree from sklearn.tree import DecisionTreeClassifier # linear from sklearn.linear_model import LogisticRegression from sklearn.metrics import accuracy_score, f1_score, \ precision_score, recall_score, roc_auc_score, confusion_matrix, classification_report
Python
복사

baseline 모델 & cross validation

def basic_model_evaluation(X: pd.DataFrame, y: pd.DataFrame, eval_metric: str,
                           cv: int, random_state: int = 42) -> None:
    """Run cross-validation over a fixed zoo of baseline classifiers.

    For each model, prints mean/std of `eval_metric` over `cv` folds;
    models that fail to score are reported instead of crashing the loop.

    Fixes vs. original: missing ':' after the signature, the nonexistent
    `AdaBoostingClassifier` (the sklearn class is AdaBoostClassifier),
    and the bare `except:` that would also swallow KeyboardInterrupt.
    """
    classifiers = [
        KNeighborsClassifier(7),  # 임의로 설정 (arbitrary k)
        SVC(probability=True, random_state=random_state),
        DecisionTreeClassifier(random_state=random_state),
        AdaBoostClassifier(random_state=random_state),
        GradientBoostingClassifier(random_state=random_state),
        ExtraTreesClassifier(random_state=random_state),
        lgb.LGBMClassifier(random_state=random_state),
        xgb.XGBClassifier(random_state=random_state),
        LogisticRegression(random_state=random_state),
    ]
    for model in classifiers:
        try:
            cv_score = cross_val_score(model, X, y, scoring=eval_metric, cv=cv)
            print('model: {0}의 f1_score(mean,std) : ({1:.4f},{2:.4f})'.format(
                model.__class__.__name__, cv_score.mean(), cv_score.std()))
        except Exception:  # narrow enough to keep iterating, wide enough for any model failure
            print('model: {0}의 f1_score 측정 불가!!!'.format(model.__class__.__name__))
Python
복사

불균형을 반영한 cross validation 함수

train[CFG.target].value_counts(normalize=True)
Python
복사
타겟 변수가 불균형이라는 점을 감안해서 predict_proba 확률이 0.36 이상이면 1로 예측하도록 조정하고, cross-validation 을 구하는 함수를 만든다.
def skf_fold_evaluation(X: pd.DataFrame, y, random_state: int = 42,
                        n_splits: int = 10, threshold: float = 0.36) -> None:
    """Stratified K-fold evaluation with a custom decision threshold.

    Because the target is imbalanced, a sample is predicted positive when
    P(class=1) >= `threshold` (default 0.36) instead of the usual 0.5.
    Prints mean accuracy / macro-F1 / precision / recall / ROC-AUC per model.

    Fixes vs. original: `model.predict(...)` returns labels, not
    probabilities — thresholding requires `predict_proba`; the undefined
    name `fold_f1_accuracy` (NameError on every fold); the nonexistent
    `AdaBoostingClassifier`; and the fold count is now a parameter instead
    of the literal 10 duplicated in five array sizes.
    """
    skf = StratifiedKFold(n_splits=n_splits)
    classifiers = [
        KNeighborsClassifier(7),
        SVC(probability=True, random_state=random_state),
        DecisionTreeClassifier(random_state=random_state),
        AdaBoostClassifier(random_state=random_state),
        GradientBoostingClassifier(random_state=random_state),
        ExtraTreesClassifier(random_state=random_state),
        lgb.LGBMClassifier(random_state=random_state),
        xgb.XGBClassifier(random_state=random_state),
        LogisticRegression(random_state=random_state),
    ]
    for model in classifiers:
        score_list = np.zeros(n_splits)
        score_f1_list = np.zeros(n_splits)
        score_precision_list = np.zeros(n_splits)
        score_recall_list = np.zeros(n_splits)
        score_roc_auc_list = np.zeros(n_splits)
        for fold, (train_index, test_index) in enumerate(skf.split(X, y)):
            X_train, X_test = X.iloc[train_index], X.iloc[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]
            model.fit(X_train, y_train)
            # threshold the positive-class probability, not the hard labels
            model_pred = model.predict_proba(X_test)[:, 1] >= threshold
            score_list[fold] = accuracy_score(y_test, model_pred)
            score_f1_list[fold] = f1_score(y_test, model_pred, average='macro')
            score_precision_list[fold] = precision_score(y_test, model_pred)
            score_recall_list[fold] = recall_score(y_test, model_pred)
            score_roc_auc_list[fold] = roc_auc_score(y_test, model_pred)
        print('='*20 + 'model:', model.__class__.__name__ + '='*20)
        print('(acc,f1,pre,recall,roc)=({0:.4f},{1:.4f},{2:.4f},{3:.4f},{4:.4f})'.format(
            score_list.mean(), score_f1_list.mean(), score_precision_list.mean(),
            score_recall_list.mean(), score_roc_auc_list.mean()))
Python
복사

기초 EDA 분석 함수

def data_info(df: pd.DataFrame) -> None:
    """Print a quick structural summary of *df*: column/row counts, shape,
    categorical vs numerical column counts, total missing values, and names."""
    categorical_cols = df.columns[df.dtypes == 'object']
    numerical_cols = df.columns[df.dtypes != 'object']

    print('column 개수:', len(df.columns))
    print('데이터 개수:', len(df))
    print('데이터의 shape:', df.shape)
    print('categorical 변수의 개수:', len(categorical_cols))
    print('categorical 컬럼:', categorical_cols)
    print('numerical 변수의 개수:', len(numerical_cols))
    print('결측치 개수:', df.isnull().sum().sum())
    print('=' * 50)
    print('컬럼 이름:', df.columns)
Python
복사
변수 개수, 변수 이름, 데이터 개수, shape, 범주형 변수 개수, 수치형 변수 개수, 결측치 개수

더 다양한 EDA 함수

# 범주형 컬럼 분포, 타겟 변수 비율 확인
def search_columns(df: pd.DataFrame, columns: str, target: str) -> None:
    """Print the value distribution of one categorical column and the mean
    target rate per category."""
    print('컬럼명:', columns)
    print('컬럼 분포 ', df[columns].value_counts(normalize=True).sort_values(ascending=True))
    print('범주 별 target 비율:', df.groupby(columns)[target].mean())


# 컬럼별 타겟 변수 분포 시각화
def search_target_ratio(df: pd.DataFrame, columns: str, target: str) -> None:
    """Return a row-normalized crosstab of `columns` vs `target` with a
    background gradient for notebook display."""
    return pd.crosstab(df[columns], df[target], margins=True,
                       normalize='index').style.background_gradient(cmap='summer_r')


# 연속형 변수에 대한 plot 그리기
def numerical_plot(df: pd.DataFrame, columns: list, nrows: int, ncols: int,
                   row_size: int, col_size: int) -> None:
    """Draw a grid of distribution plots, one per column in `columns`.

    Fix: `sns.distplot` was deprecated and removed in modern seaborn;
    `sns.histplot(..., kde=True)` is the supported replacement.
    Also adds the missing space in the subplot title.
    """
    fig, axes = plt.subplots(nrows, ncols, figsize=(row_size, col_size))
    count = 0
    for i in range(nrows):
        for j in range(ncols):
            sns.histplot(x=df[columns[count]], kde=True, ax=axes[i][j])
            axes[i][j].set_title(columns[count] + ' column distribution')
            axes[i][j].set_xlabel(columns[count])
            count += 1


def numerical_target_group_ratio(df: pd.DataFrame, columns: list, nrows: int, ncols: int,
                                 row_size: int, col_size: int) -> None:
    """Bar-plot the percentage of each target value per grouping column.

    NOTE(review): `columns[-1]` is treated as the target column — the caller
    must append the target name to `columns`; verify against call sites.
    """
    fig, axes = plt.subplots(nrows, ncols, figsize=(row_size, col_size))
    count = 0
    for i in range(nrows):
        for j in range(ncols):
            counts = df.groupby(columns[count])[columns[-1]].value_counts(
                normalize=True).rename('percentage').mul(100).reset_index()
            sns.barplot(x=columns[count], y='percentage', hue=columns[-1],
                        data=counts, ax=axes[i][j])
            axes[i][j].set_title(columns[count] + ' distribution')
            axes[i][j].set_xlabel(columns[count])
            axes[i][j].set_xticks([0, 2.5, 5, 7.5, 10])
            count += 1


# 상관계수 히트맵 그리기
def correlation_matrix(df: pd.DataFrame, numerical_columns: list, figsize: tuple) -> None:
    """Heatmap of Pearson correlations; cell annotations only when the
    matrix is small enough to stay readable (<= 20 columns).

    Fix: the original called `plt.title` BEFORE `plt.figure`, so the title
    was applied to the previous figure; the figure must be created first.
    """
    colormap = plt.cm.PuBu
    plt.figure(figsize=figsize)
    plt.title('Pearson Correlation of data')
    if len(numerical_columns) <= 20:
        sns.heatmap(df[numerical_columns].astype(float).corr(), linewidth=0.1, vmax=1.0,
                    square=True, cmap=colormap, linecolor='white', annot=True,
                    annot_kws={'size': 16})
    else:
        sns.heatmap(df[numerical_columns].astype(float).corr(), linewidth=0.1, vmax=1.0,
                    square=True, cmap=colormap)


def describe_matrix(df: pd.DataFrame) -> pd.DataFrame:
    """Thin wrapper around `DataFrame.describe()` for notebook display."""
    return df.describe()
Python
복사

Data Preprocessing 및 Scaling

# position Object 컬럼 축소
def position_convert(x: str) -> str:
    """Collapse a detailed position code into FW / MF / DF / GK.

    NOTE(review): any code not in the three lists (including typos) maps
    to 'GK' — confirm that is intended rather than raising.
    """
    attack = ['ST', 'RW', 'CF', 'LW']
    midfield = ['CDM', 'CAM', 'CM', 'RM', 'LM']
    defend = ['CB', 'RB', 'LB', 'RWB', 'LWB']
    if x in attack:
        return 'FW'
    elif x in midfield:
        return 'MF'
    elif x in defend:
        return 'DF'
    else:
        return 'GK'


# 범주형 변수에 대하여 라벨인코딩 진행
def categorical_column_preprocessing(train_df: pd.DataFrame, test_df: pd.DataFrame,
                                     columns: list) -> 'tuple[pd.DataFrame, pd.DataFrame]':
    """Label-encode each column in-place on both frames, fitting on train only.

    Fix: the log message printed 'Positionlabel encoder classes' — space added.
    NOTE(review): `le.transform(test_df[...])` raises on labels unseen in
    train; acceptable for this competition split, but verify.
    """
    for column in columns:
        le = LabelEncoder()
        le.fit(train_df[column])
        print(column + ' label encoder classes', le.classes_)
        train_df[column] = le.transform(train_df[column])
        test_df[column] = le.transform(test_df[column])
    return (train_df, test_df)


# 연속형 변수에 대해 표준화 진행
def standard_scaler(train_df: pd.DataFrame, test_df: pd.DataFrame, categorical_columns,
                    numeric_columns: list) -> 'tuple[pd.DataFrame, pd.DataFrame]':
    """Standardize numeric columns (fit on train), keep categoricals as-is.

    Fix: the scaled DataFrames now reuse the source index — the original
    built them with a fresh RangeIndex, so `pd.concat(axis=1)` silently
    misaligned rows whenever the input index was not 0..n-1.
    """
    scaler = StandardScaler()
    scaler.fit(train_df[numeric_columns])
    x_scaled = pd.DataFrame(scaler.transform(train_df[numeric_columns]),
                            columns=numeric_columns, index=train_df.index)
    x_test_scaled = pd.DataFrame(scaler.transform(test_df[numeric_columns]),
                                 columns=numeric_columns, index=test_df.index)
    train_data = pd.concat([train_df[categorical_columns], x_scaled], axis=1)
    test_data = pd.concat([test_df[categorical_columns], x_test_scaled], axis=1)
    return (train_data, test_data)


def minmax_scaler(train_df: pd.DataFrame, test_df: pd.DataFrame, categorical_columns,
                  numeric_columns: list) -> 'tuple[pd.DataFrame, pd.DataFrame]':
    """Min-max scale numeric columns (fit on train), keep categoricals as-is.
    Same index-alignment fix as `standard_scaler`."""
    scaler = MinMaxScaler()
    scaler.fit(train_df[numeric_columns])
    x_scaled = pd.DataFrame(scaler.transform(train_df[numeric_columns]),
                            columns=numeric_columns, index=train_df.index)
    x_test_scaled = pd.DataFrame(scaler.transform(test_df[numeric_columns]),
                                 columns=numeric_columns, index=test_df.index)
    train_data = pd.concat([train_df[categorical_columns], x_scaled], axis=1)
    test_data = pd.concat([test_df[categorical_columns], x_test_scaled], axis=1)
    return (train_data, test_data)


# 학습데이터와 테스트데이터 비율 나누기
def train_test(X: pd.DataFrame, y: pd.DataFrame, test_size: float, random_state=42) -> \
        'tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]':
    """Stratified train/validation split preserving the target ratio."""
    X_train, X_valid, y_train, y_valid = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y)
    return X_train, X_valid, y_train, y_valid
Python
복사

파일 불러오기

  TIP 1  
불러올 때 필요없는 변수, 중복 모두 제거하고 index도 초기화한다
# Load the competition files. The ID column carries no signal, so drop it;
# deduplicate the training rows and renumber the index from zero.
train_df = (
    pd.read_csv('train.csv')
    .drop(columns='ID')
    .drop_duplicates()
    .reset_index(drop=True)
)
test_df = pd.read_csv('test.csv').drop(columns='ID')
submission_df = pd.read_csv('sample_submission.csv')
Python
복사

파생변수 생성 및 전처리

# Feature engineering, applied identically to train and test.
# Fix vs. original: `train_df['Position'].apply(...)` was called without
# assigning the result back, so the position grouping was silently discarded.
# The train/test duplication is collapsed into one loop over both frames
# (column assignment mutates each frame in place).

def _row_mean(df: pd.DataFrame, cols: list) -> pd.Series:
    # Explicit sum/len with skipna=False matches the original chained `+`,
    # which propagates NaN (DataFrame.mean would silently skip NaN).
    return df[cols].sum(axis=1, skipna=False) / len(cols)

# 피처 다중공선성 띄는 변수들 합쳐서 새로운 파생 변수 생성에 쓰는 컬럼 묶음
overall = [
    'Crossing', 'Finishing', 'HeadingAccuracy', 'ShortPassing', 'Volleys', 'Dribbling',
    'Curve', 'FKAccuracy', 'LongPassing', 'BallControl', 'Acceleration', 'SprintSpeed',
    'Agility', 'Reactions', 'Balance', 'ShotPower', 'Jumping', 'Stamina', 'Strength',
    'LongShots', 'Aggression', 'Interceptions', 'Positioning', 'Vision', 'Penalties',
    'Composure', 'Marking', 'StandingTackle', 'SlidingTackle', 'GKDiving', 'GKHandling',
    'GKKicking', 'GKPositioning', 'GKReflexes', 'GKRating']

for df in (train_df, test_df):
    # Position 컬럼 축소 (assign the result back — the original dropped it)
    df['Position'] = df['Position'].apply(position_convert)

    # 나이 범주 변수 추가
    df['age_convert'] = df['Age'].apply(lambda x: 'young' if x < 20 else 'adult')

    # 키 범주 생성 (bins chosen from the height quartiles — TODO confirm)
    df['Height_cat'] = 0
    df.loc[df['Height'] <= 176.0, 'Height_cat'] = 1
    df.loc[(df['Height'] > 176.0) & (df['Height'] <= 180.0), 'Height_cat'] = 2
    df.loc[(df['Height'] > 180.0) & (df['Height'] <= 185.75), 'Height_cat'] = 3
    df.loc[df['Height'] > 185.75, 'Height_cat'] = 4

    # 몸무게 범주 생성
    df['Weight_cat'] = 0
    df.loc[df['Weight'] <= 70.0, 'Weight_cat'] = 1
    df.loc[(df['Weight'] > 70.0) & (df['Weight'] <= 75.0), 'Weight_cat'] = 2
    df.loc[df['Weight'] > 75.0, 'Weight_cat'] = 3

    # 다중공선성 있는 능력치들을 평균 내어 파생 변수 생성
    df['GK_ability'] = _row_mean(df, ['GKRating', 'GKHandling', 'GKKicking',
                                      'GKPositioning', 'GKReflexes'])
    df['FW_ability'] = _row_mean(df, ['ShotPower', 'LongShots', 'Aggression', 'Volleys',
                                      'Finishing', 'HeadingAccuracy', 'Dribbling',
                                      'Curve', 'Crossing'])
    df['MF_ability'] = _row_mean(df, ['Crossing', 'ShortPassing', 'Dribbling', 'Curve',
                                      'LongPassing', 'BallControl', 'Positioning',
                                      'Vision', 'Composure'])
    df['DF_ability'] = _row_mean(df, ['Marking', 'StandingTackle', 'SlidingTackle'])
    df['DFRating'] = _row_mean(df, ['LWBRating', 'LBRating', 'CBRating',
                                    'RBRating', 'RWBRating'])
    df['MFRating'] = _row_mean(df, ['CAMRating', 'LMRating', 'CMRating',
                                    'RMRating', 'CDMRating'])
    df['FWRating'] = _row_mean(df, ['STRating', 'LWRating', 'LFRating',
                                    'CFRating', 'RFRating', 'RWRating'])

    # 라인별 rating 중 최댓값인 포지션
    df['Maximum_Rating_Position'] = df[['DFRating', 'FWRating',
                                        'GKRating', 'MFRating']].idxmax(axis=1)

    df['Physical_ability'] = _row_mean(df, ['Acceleration', 'SprintSpeed', 'Agility',
                                            'Reactions', 'Balance', 'Jumping',
                                            'Stamina', 'Strength'])

    # 전체 능력치 합 (original used default skipna=True sum here — preserved)
    df['Overall_ability'] = df[overall].sum(axis=1)
Python
복사