# Install category_encoders
!pip install category_encoders
# Install pandas-profiling
!pip install pandas-profiling==2.11.0

import pandas as pd
from category_encoders import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline

# Name of the target column
target = 'vacc_h1n1_f'
# Load the training features and attach the target column, aligning rows by index.
train = pd.merge(
    pd.read_csv('train.csv'),
    pd.read_csv('train_labels.csv')[target],
    left_index=True,
    right_index=True,
)
test = pd.read_csv('test.csv')

# Hold out a validation set: the later cells use `val`, which was never
# defined even though `train_test_split` is imported.
# NOTE(review): the split ratio, stratification and seed are not recorded in
# this file -- confirm against the original experiment.
train, val = train_test_split(
    train,
    train_size=0.80,
    test_size=0.20,
    stratify=train[target],
    random_state=2,
)

# Preview the first rows, transposed so each feature is on its own row.
train.head().T
# Feature engineering
import numpy as np


def engineer(df, max_cardinality=30):
    """Return a feature-engineered copy of ``df``.

    Steps:
      1. Keep only numeric/object columns that have at most
         ``max_cardinality`` distinct values.
      2. Add a ``behaviorals`` column: the row-wise sum of every column
         whose name contains ``'behavioral'``.
      3. Drop every column whose name contains ``'employment'`` or ``'seas'``.

    Args:
        df: Input DataFrame.
        max_cardinality: Largest number of distinct values a column may
            have and still be kept (inclusive). Defaults to 30.

    Returns:
        A new DataFrame with the selected and derived columns.
    """
    # Remove high-cardinality features.
    candidates = df.select_dtypes(include=['number', 'object'])
    cardinality = candidates.nunique()  # distinct-value count per column
    selected_features = cardinality[cardinality <= max_cardinality].index.tolist()

    # Work on an explicit copy so that adding a column below does not raise
    # pandas' SettingWithCopyWarning on a column-subset of the input.
    df = df[selected_features].copy()

    # Create the new aggregate feature.
    behaviorals = [col for col in df.columns if 'behavioral' in col]
    df['behaviorals'] = df[behaviorals].sum(axis=1)

    dels = [col for col in df.columns if 'employment' in col or 'seas' in col]
    return df.drop(columns=dels)


# NOTE(review): cardinality is measured separately on each frame, so the three
# frames are only guaranteed to share columns if their per-column cardinalities
# fall on the same side of the threshold -- verify.
train = engineer(train)
val = engineer(val)
test = engineer(test)

# Drop the target from the feature list
# Every column except the target is a feature.
features = train.columns.drop(target)

# Split the train / validation / test frames into features and target.
X_train, y_train = train[features], train[target]
X_val, y_val = val[features], val[target]
X_test = test[features]
from sklearn.tree import DecisionTreeClassifier

# Baseline pipeline: one-hot encode -> impute (default strategy) -> decision
# tree with the entropy criterion and no depth limit.
tree = DecisionTreeClassifier(criterion='entropy', random_state=1)
pipe = make_pipeline(OneHotEncoder(use_cat_names=True), SimpleImputer(), tree)
pipe.fit(X_train, y_train)

train_score = pipe.score(X_train, y_train)
print('훈련 정확도: ', train_score)
val_score = pipe.score(X_val, y_val)
print('검증 정확도: ', val_score)

# Recorded output of the cell above, kept as a bare string literal:
# training accuracy ~0.991 vs. validation accuracy ~0.757, which the author
# notes can be regarded as overfitting.
'''
훈련 정확도: 0.9908667674880646
검증 정확도: 0.7572055509429486
과적합 상태라고 볼 수 있다.
'''
# Visualization
import graphviz
from sklearn.tree import export_graphviz

# Fetch the fitted steps from the pipeline by their auto-generated names.
fitted_steps = pipe.named_steps
model_dt = fitted_steps['decisiontreeclassifier']
enc = fitted_steps['onehotencoder']

# Column names after one-hot encoding; used to label the tree's split nodes.
encoded_columns = enc.transform(X_val).columns

# Render only the top three levels of the tree.
dot_data = export_graphviz(
    model_dt,
    max_depth=3,
    feature_names=encoded_columns,
    class_names=['no', 'yes'],
    filled=True,
    proportion=True,
)
tree_graph = graphviz.Source(dot_data)
display(tree_graph)
from sklearn.metrics import f1_score

# Same pipeline as before, but with the tree depth capped.
pipe = make_pipeline(
    OneHotEncoder(use_cat_names=True),
    SimpleImputer(),
    DecisionTreeClassifier(max_depth=7, random_state=2),  # depth changed
)
pipe.fit(X_train, y_train)

# Report accuracy on the training split, then on the validation split.
for label, X, y in (('훈련 정확도', X_train, y_train), ('검증 정확도', X_val, y_val)):
    print(label, pipe.score(X, y))

# Compute the F1 score on the validation predictions.
pred = pipe.predict(X_val)
print('f1 스코어', f1_score(y_val, pred))

# Recorded output of the cell above, kept as a bare string literal
# (training accuracy, validation accuracy, F1 score).
'''
훈련 정확도 0.8317468789846693
검증 정확도 0.8254062388803226
f1 스코어 0.551219512195122
'''
# Use the median imputer
# Median imputation with a shallower tree.
pipe = make_pipeline(
    OneHotEncoder(use_cat_names=True),
    SimpleImputer(strategy='median'),
    DecisionTreeClassifier(max_depth=3, random_state=2),  # depth value adjusted
)
pipe.fit(X_train, y_train)

# Report accuracy on the training split, then on the validation split.
for label, X, y in (('훈련 정확도', X_train, y_train), ('검증 정확도', X_val, y_val)):
    print(label, pipe.score(X, y))

pred = pipe.predict(X_val)
print('f1 스코어', f1_score(y_val, pred))

# Recorded output of the cell above, kept as a bare string literal
# (training accuracy, validation accuracy, F1 score).
'''
훈련 정확도 0.7994247249651573
검증 정확도 0.8054797770134029
f1 스코어 0.5866935483870968
'''
# Visualization
# Plot the fitted tree's feature importances, labelled with the one-hot
# encoded column names, as a horizontal bar chart sorted ascending.
model_dt = pipe.named_steps['decisiontreeclassifier']
importances = pd.Series(model_dt.feature_importances_, encoded_columns)

# `plt` is never imported in this file, so the figure size is passed straight
# to pandas' plotting call instead of going through `plt.figure(...)`.
# The trailing semicolon suppresses the notebook's echo of the Axes object.
importances.sort_values().plot.barh(figsize=(10, 30));
# Kaggle Submit
# Predict on the test set
tpred = pipe.predict(X_test)

# Build the submission frame from the template file
submission = pd.read_csv('submission.csv')
submission['vacc_h1n1_f'] = tpred
submission

# Export to file (overwrites the template that was just read)
submission.to_csv('submission.csv', index=False)