一、数据处理
1、数据源包含train数据集和test数据集
2、train数据集包含891条乘客信息,test数据集包含418条乘客信息
3、数据包含以下信息:PassengerId, Survived, Pclass, Name, Sex, Age, SibSp, Parch, Ticket, Fare, Cabin和Embarked
4、数据预处理包括:缺失值填充、数据清洗、特征选择
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
# Read the raw Kaggle Titanic CSV files from the current working directory.
# train.csv carries the 'Survived' label; test.csv does not (Kaggle holdout).
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')
# Data preprocessing.
def data_preprocessing(df):
    """Clean a Titanic passenger DataFrame and extract model features.

    Fills missing Age/Fare with the column mean and missing Embarked with
    the column mode, encodes Sex and Embarked as small integers, and
    selects the model feature columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw passenger data. Must contain Pclass, Sex, Age, SibSp, Parch,
        Fare and Embarked. 'Survived' is optional — the Kaggle test set
        does not have it.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.Series or None)
        Feature matrix ``x`` and target ``y``; ``y`` is None when ``df``
        has no 'Survived' column.
    """
    # Work on a copy: the original version dropped columns with
    # inplace=True and so silently mutated the caller's DataFrame.
    df = df.copy()
    # Missing-value imputation.
    df['Age'] = df['Age'].fillna(df['Age'].mean())
    df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode().iloc[0])
    df['Fare'] = df['Fare'].fillna(df['Fare'].mean())
    # Categorical encoding.
    df['Sex'] = df['Sex'].map({'male': 0, 'female': 1})
    df['Embarked'] = df['Embarked'].map({'S': 0, 'C': 1, 'Q': 2})
    # Feature selection (explicit column list makes dropping
    # Cabin/Ticket/Name unnecessary).
    features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
    x = df[features]
    # The test split has no labels: return y=None instead of raising
    # KeyError, as the original code did when given test_data.
    y = df['Survived'] if 'Survived' in df.columns else None
    return x, y
# Build feature matrix / label vector for each split.
x_train, y_train = data_preprocessing(train_data)
# NOTE(review): the Kaggle test.csv has no 'Survived' column, so as written
# data_preprocessing raises KeyError here and real y_test labels do not
# exist for the test split — confirm the data source before trusting any
# downstream "test" scores.
x_test, y_test = data_preprocessing(test_data)
二、特征分析
1、数据共包含12列,其中Survived为目标变量,其余11列为候选特征
2、Sex和Pclass对Survived的影响较大
3、Cabin、Name和Ticket对Survived的影响较小
4、SibSp和Parch这两个特征可以合并为一个特征Family_Size
5、Fare和Age可以分别按照数值大小和区间进行分组
# Derived features, applied identically to both splits.
def _add_engineered_features(frame):
    # Family_Size counts the passenger plus siblings/spouses and parents/children.
    frame['Family_Size'] = frame['SibSp'] + frame['Parch'] + 1
    # Bucket the continuous Fare and Age columns into interval categories.
    frame['Fare_Group'] = pd.cut(x=frame['Fare'], bins=[0, 20, 40, 60, 80, 1000])
    frame['Age_Group'] = pd.cut(x=frame['Age'], bins=[0, 12, 18, 30, 50, 100])

for _frame in (train_data, test_data):
    _add_engineered_features(_frame)
# Feature analysis: survival-rate bar plot per categorical/engineered feature,
# in the same order as before (Sex, Pclass, Family_Size, Fare_Group, Age_Group).
for _feature in ('Sex', 'Pclass', 'Family_Size', 'Fare_Group', 'Age_Group'):
    sns.barplot(x=_feature, y='Survived', data=train_data)
    plt.show()
三、模型训练
1、使用逻辑回归、KNN、决策树、随机森林、GBDT等常见模型进行分类预测
2、使用交叉验证法选取最优模型,并进行模型融合
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import VotingClassifier
# Candidate classifier families to compare (default hyper-parameters).
_model_classes = (
    LogisticRegression,
    KNeighborsClassifier,
    DecisionTreeClassifier,
    RandomForestClassifier,
    GradientBoostingClassifier,
)
models = [cls() for cls in _model_classes]
# Rank the candidates by 5-fold cross-validated accuracy on the train split.
for clf in models:
    scores = cross_val_score(clf, x_train, y_train, cv=5, scoring='accuracy')
    print('Model: {}, Score: {} (+/- {})'.format(clf.__class__.__name__, scores.mean(), scores.std()))
# Model fusion: soft voting averages the members' predicted class
# probabilities, so every member must support predict_proba.
_ensemble_members = [
    ('Logistic Regression', LogisticRegression()),
    ('K Nearest Neighbor', KNeighborsClassifier()),
    ('Decision Tree', DecisionTreeClassifier()),
    ('Random Forest', RandomForestClassifier()),
    ('Gradient Boosting', GradientBoostingClassifier()),
]
voting_classifier = VotingClassifier(estimators=_ensemble_members, voting='soft')
voting_classifier.fit(x_train, y_train)
# NOTE(review): scoring needs labeled test data, but the Kaggle test split
# has no Survived labels — confirm where y_test comes from.
print('Voting Classifier:', voting_classifier.score(x_test, y_test))
四、模型调优
1、GridSearchCV进行参数调优以提高模型准确度
from sklearn.model_selection import GridSearchCV
# Random-forest hyper-parameter tuning via exhaustive grid search
# (5-fold CV, accuracy scoring; random_state fixed for reproducibility).
_rf_param_grid = {
    'n_estimators': [400, 500, 600, 700, 800],
    'max_features': [3, 4, 5, 6, 7],
    'criterion': ['gini', 'entropy'],
}
rfc = RandomForestClassifier(random_state=123)
grid_search = GridSearchCV(rfc, _rf_param_grid, cv=5, scoring='accuracy')
grid_search.fit(x_train, y_train)
print('Best Parameters:', grid_search.best_params_)
print('Best Score:', grid_search.best_score_)
五、模型评估
1、使用confusion_matrix、classification_report、ROC曲线等指标对模型进行评估
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, auc
# Evaluate the fitted ensemble on the train split.
_train_pred = voting_classifier.predict(x_train)
print('Train Confusion Matrix:\n', confusion_matrix(y_train, _train_pred))
print('Train Classification Report:\n', classification_report(y_train, _train_pred))
# Evaluate on the test split (assumes y_test labels exist).
_test_pred = voting_classifier.predict(x_test)
print('Test Confusion Matrix:\n', confusion_matrix(y_test, _test_pred))
print('Test Classification Report:\n', classification_report(y_test, _test_pred))
# ROC curve and AUC value.
def plot_roc_curve(model, x, y):
    """Plot the ROC curve of ``model`` on ``(x, y)`` and display it.

    ``model`` must expose predict_proba; column 1 of its output is taken
    as the positive-class score.
    """
    positive_scores = model.predict_proba(x)[:, 1]
    fpr, tpr, _ = roc_curve(y, positive_scores)
    area = auc(fpr, tpr)
    plt.plot(fpr, tpr, label='ROC Curve (area = {:.2f})'.format(area))
    plt.plot([0, 1], [0, 1], 'k--')  # chance diagonal for reference
    plt.xlim([0, 1])
    plt.ylim([0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve')
    plt.legend(loc='lower right')
    plt.show()

plot_roc_curve(voting_classifier, x_test, y_test)
原创文章,作者:JUYET,如若转载,请注明出处:https://www.506064.com/n/369414.html