class KFoldDataset(BaseKFoldDataset):
    """K-fold CV dataset with optional rebalancing of each training fold."""

    X: pd.DataFrame            # training features
    y: pd.Series               # training labels
    X_predict: pd.DataFrame    # features to run inference on
    use_oversampling: bool = False   # SMOTE the training fold when True
    use_undersampling: bool = False  # randomly undersample the training fold when True
    seed: int = 2023                 # RNG seed shared by splitting and resampling

    def setup(self) -> None:
        """Min-max scale the feature frames in place (no return value)."""
        self.X, self.X_predict = minmax_scaling(self.X, self.X_predict)

    def setup_folds(self, num_folds: int) -> None:
        """Build the stratified train/validation index splits for CV."""
        splitter = StratifiedKFold(num_folds, shuffle=True, random_state=self.seed)
        self.splits = list(splitter.split(self.X, self.y))

    def setup_fold_index(self, fold_index: int) -> None:
        """Materialize the train/validation subsets for one fold."""
        train_idx, valid_idx = self.splits[fold_index]
        self.X_train = self.X.iloc[train_idx]
        self.y_train = self.y.iloc[train_idx]
        self.X_val = self.X.iloc[valid_idx]
        self.y_val = self.y.iloc[valid_idx]

    def setup_fold_end(self) -> None:
        """Rebalance the class distribution of the training fold."""
        if self.use_oversampling:
            # Size of class 1 (per the original note, the largest class —
            # TODO confirm against the dataset); SMOTE grows class 3 to match it.
            majority_count = self.y_train.value_counts().get(1)
            smote = SMOTE(sampling_strategy={3: majority_count}, random_state=self.seed)
            self.X_train, self.y_train = smote.fit_resample(self.X_train, self.y_train)
        if self.use_undersampling:
            # Size of class 13 (per the original note, the smallest class —
            # TODO confirm); classes 0 and 7 are shrunk down to that count.
            minority_count = self.y_train.value_counts().get(13)
            sampler = RandomUnderSampler(
                sampling_strategy={0: minority_count, 7: minority_count},
                random_state=self.seed,
            )
            self.X_train, self.y_train = sampler.fit_resample(self.X_train, self.y_train)
...
Python
Copy
# 5-fold stratified CV; each fold trains an ensemble used to pseudo-label
# the unlabeled pool.
kfold = StratifiedKFold(n_splits=5, random_state=41, shuffle=True)
test_probs_fold = []
for train_index, valid_index in kfold.split(train_x_raw, train_y_raw):
    train_x_fold, train_y_fold = train_x_raw.iloc[train_index], train_y_raw.iloc[train_index]
    valid_x_fold, valid_y_fold = train_x_raw.iloc[valid_index], train_y_raw.iloc[valid_index]

    ### step1 ###
    # Fit the scaler on this fold's training rows only, then apply the same
    # transform to the unlabeled data so both live in the same feature space.
    scaler_ = MinMaxScaler()
    scaler_.fit(train_x_fold)
    train_x_sc = pd.DataFrame(scaler_.transform(train_x_fold),
                              columns=train_x_fold.columns, index=train_x_fold.index)
    unlabeled_x_sc = pd.DataFrame(scaler_.transform(unlabeled_x),
                                  columns=unlabeled_x.columns, index=unlabeled_x.index)

    # Ensemble of 10 MLPs, each trained on a differently-seeded undersampled view.
    probs = []
    for i in range(10):
        # Default 'auto' strategy: shrink every majority class down to the
        # minority-class size.
        sampler_ = RandomUnderSampler(random_state=i)
        train_x_rs, train_y_rs = sampler_.fit_resample(train_x_sc, train_y_fold)
        model = MLPClassifier(random_state=8, hidden_layer_sizes=(12,))
        model.fit(train_x_rs, train_y_rs)
        # Probabilities for the unlabeled pool, to be averaged for pseudo-labels.
        probs.append(model.predict_proba(unlabeled_x_sc))
    # Pseudo-label: 1 when the ensemble-mean P(class=1) >= 0.75, else 0.
    # NOTE(review): original indentation was lost in extraction; this line is
    # placed inside the fold loop because `probs` is reset per fold — confirm.
    preds = np.where(np.mean(np.array(probs)[:, :, 1], axis=0) < 0.75, 0, 1)
Python
Copy
# Load the KLUE topic-classification splits.
train = pd.read_csv("data/klue/open/train_data.csv")
df_valid = pd.read_csv("data/klue/open/test_data.csv")  # test split (presumably used later — keep)

# Reshape to (n, 1) column vectors: imblearn samplers expect a 2-D feature matrix.
titles_t = train.title.to_numpy().reshape(-1, 1)
labels_t = train.topic_idx.to_numpy().reshape(-1, 1)

# Random oversampling: duplicate rows of minority topics until every class
# reaches the majority count. random_state added so the resampled dataset is
# reproducible, matching the seeded samplers/splitters elsewhere in this file.
oversample = RandomOverSampler(random_state=2023)
X_over, y_over = oversample.fit_resample(titles_t, labels_t)

# Flatten the resampled column vectors back into a tidy DataFrame.
train = pd.DataFrame({'title': X_over.reshape(-1), 'topic_idx': y_over.reshape(-1)})
Python
Copy