Interpreting Preprocessing Code from a Dacon Competition

The first snippet defines a `KFoldDataset` that scales the features once, builds stratified splits, and resamples each fold's training data:

```python
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler


class KFoldDataset(BaseKFoldDataset):  # BaseKFoldDataset and minmax_scaling are defined elsewhere
    X: pd.DataFrame                    # declare the data fields
    y: pd.Series
    X_predict: pd.DataFrame
    use_oversampling: bool = False     # default values
    use_undersampling: bool = False
    seed: int = 2023

    def setup(self) -> None:
        # scaling (no return value)
        self.X, self.X_predict = minmax_scaling(self.X, self.X_predict)

    def setup_folds(self, num_folds: int) -> None:
        # set up the splits for cross-validation
        kfold = StratifiedKFold(num_folds, shuffle=True, random_state=self.seed)
        self.splits = [split for split in kfold.split(self.X, self.y)]

    def setup_fold_index(self, fold_index: int) -> None:
        # extract the train/valid data for the current fold
        train_indices, val_indices = self.splits[fold_index]
        self.X_train, self.y_train = self.X.iloc[train_indices], self.y.iloc[train_indices]
        self.X_val, self.y_val = self.X.iloc[val_indices], self.y.iloc[val_indices]

    def setup_fold_end(self) -> None:
        # handle class imbalance
        if self.use_oversampling:
            # oversampling: grow class 3 up to the count of class 1 (the largest class)
            value = self.y_train.value_counts().get(1)
            smote = SMOTE(sampling_strategy={3: value}, random_state=self.seed)
            self.X_train, self.y_train = smote.fit_resample(self.X_train, self.y_train)
        if self.use_undersampling:
            # undersampling: shrink classes 0 and 7 down to the count of class 13 (the smallest class)
            value = self.y_train.value_counts().get(13)
            under_sampler = RandomUnderSampler(sampling_strategy={0: value, 7: value}, random_state=self.seed)
            self.X_train, self.y_train = under_sampler.fit_resample(self.X_train, self.y_train)
    ...
```
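For context, here is a minimal sketch of how a fold loop might drive this class. The driver below is an assumption, not part of the original post: `X`, `y`, `X_test`, and `train_one_fold` are hypothetical names, and it assumes the base class provides a dataclass-style constructor.

```python
# hypothetical driver loop; train_one_fold() is a placeholder, not from the original code
dataset = KFoldDataset(X=X, y=y, X_predict=X_test, use_oversampling=True)
dataset.setup()           # min-max scale the features once up front
dataset.setup_folds(5)    # build the stratified splits

for fold in range(5):
    dataset.setup_fold_index(fold)  # pick this fold's train/valid rows
    dataset.setup_fold_end()        # resample only the training split
    train_one_fold(dataset.X_train, dataset.y_train, dataset.X_val, dataset.y_val)
```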
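The `sampling_strategy` dicts deserve a note: when given a dict, SMOTE resamples only the classes listed as keys, up to the requested counts, and leaves every other class alone. A runnable toy example (the class labels and counts here are made up for illustration):

```python
from collections import Counter

import numpy as np
from imblearn.over_sampling import SMOTE

# toy data: class 1 is the majority (60 rows), class 3 the minority (10 rows)
X = np.random.rand(100, 4)
y = np.array([1] * 60 + [0] * 30 + [3] * 10)

# grow class 3 up to the size of class 1; classes not in the dict are untouched
smote = SMOTE(sampling_strategy={3: 60}, random_state=0)
X_res, y_res = smote.fit_resample(X, y)
print(Counter(y_res))  # Counter({1: 60, 3: 60, 0: 30})
```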
The second snippet combines 5-fold cross-validation with repeated undersampling and a small MLP ensemble to pseudo-label unlabeled data:

```python
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.neural_network import MLPClassifier
from imblearn.under_sampling import RandomUnderSampler

kfold = StratifiedKFold(n_splits=5, random_state=41, shuffle=True)  # 5-fold cross-validation
test_probs_fold = []
for train_index, valid_index in kfold.split(train_x_raw, train_y_raw):
    train_x_fold, train_y_fold = train_x_raw.iloc[train_index], train_y_raw.iloc[train_index]
    valid_x_fold, valid_y_fold = train_x_raw.iloc[valid_index], train_y_raw.iloc[valid_index]

    ### step1 ###
    # scaling: fit MinMaxScaler on this fold's training data only
    scaler_ = MinMaxScaler()
    scaler_.fit(train_x_fold)
    train_x_sc = pd.DataFrame(scaler_.transform(train_x_fold),
                              columns=train_x_fold.columns, index=train_x_fold.index)
    # apply the same scaling to the unlabeled data
    unlabeled_x_sc = pd.DataFrame(scaler_.transform(unlabeled_x),
                                  columns=unlabeled_x.columns, index=unlabeled_x.index)

    probs = []
    for i in range(0, 10):
        # resampling to handle class imbalance; the default 'auto' strategy
        # shrinks the majority classes down to the minority class size
        sampler_ = RandomUnderSampler(random_state=i)
        train_x_rs, train_y_rs = sampler_.fit_resample(train_x_sc, train_y_fold)

        # training
        model = MLPClassifier(random_state=8, hidden_layer_sizes=(12,))  # define the model
        model.fit(train_x_rs, train_y_rs)                                # train the model

        # inference for pseudo-labeling
        probs.append(model.predict_proba(unlabeled_x_sc))

    # threshold = 0.75: returns 0 where the mean positive probability is below 0.75, 1 otherwise
    preds = np.where(np.mean(np.array(probs)[:, :, 1], axis=0) < 0.75, 0, 1)
```
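To make the final thresholding line concrete, here is a small standalone example with made-up probabilities showing what `np.array(probs)[:, :, 1]` selects and how the 0.75 cutoff maps averaged probabilities to hard labels:

```python
import numpy as np

# pretend we collected predict_proba output from 10 models over 3 unlabeled samples;
# each entry has shape (3, 2): columns are P(class 0) and P(class 1)
probs = [np.array([[0.2, 0.8],
                   [0.9, 0.1],
                   [0.4, 0.6]]) for _ in range(10)]

# stack to shape (10, 3, 2), keep the class-1 column, average over the 10 models
mean_pos = np.mean(np.array(probs)[:, :, 1], axis=0)  # -> [0.8, 0.1, 0.6]

# pseudo-labels: 1 where the mean probability is at least 0.75, else 0
preds = np.where(mean_pos < 0.75, 0, 1)               # -> [1, 0, 0]
```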
The third snippet balances a text dataset (the KLUE paths suggest a topic-classification task) with random oversampling:

```python
import pandas as pd
from imblearn.over_sampling import RandomOverSampler

train = pd.read_csv("data/klue/open/train_data.csv")
df_valid = pd.read_csv("data/klue/open/test_data.csv")

# RandomOverSampler expects 2-D features, so reshape the titles and labels
titles_t = train.title.to_numpy().reshape(-1, 1)
labels_t = train.topic_idx.to_numpy().reshape(-1, 1)

oversample = RandomOverSampler()  # oversampling: duplicate minority-class rows until classes are balanced
X_over, y_over = oversample.fit_resample(titles_t, labels_t)
train = pd.DataFrame({'title': X_over.reshape(-1), 'topic_idx': y_over.reshape(-1)})
```
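Because `RandomOverSampler` simply duplicates existing rows, it also works on non-numeric features like the title strings above. A toy example with invented titles and labels:

```python
from collections import Counter

import numpy as np
from imblearn.over_sampling import RandomOverSampler

# toy data: 5 titles in topic 0, only 2 in topic 1
titles = np.array(["t1", "t2", "t3", "t4", "t5", "t6", "t7"]).reshape(-1, 1)
topics = np.array([0, 0, 0, 0, 0, 1, 1])

oversample = RandomOverSampler(random_state=0)
X_over, y_over = oversample.fit_resample(titles, topics)
print(Counter(y_over.tolist()))  # Counter({0: 5, 1: 5}): topic-1 rows were duplicated up to 5
```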