泰坦尼克号生存预测

一、数据处理

1、数据源包含train数据集和test数据集

2、train数据集包含891条乘客信息,test数据集包含418条乘客信息

3、数据包含以下信息:PassengerId, Survived, Pclass, Name, Sex, Age, SibSp, Parch, Ticket, Fare, Cabin和Embarked

4、数据预处理包括:缺失值填充、数据清洗、特征选择

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Load the Kaggle Titanic train/test splits.
# NOTE(review): assumes train.csv and test.csv live in the current working
# directory — confirm against how this script is launched.
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

# Data preprocessing
def data_preprocessing(df):
    """Clean a Titanic passenger DataFrame and split it into features/target.

    Mutates *df* in place: fills missing Age/Fare with the column mean and
    missing Embarked with the most frequent port, drops the Cabin/Ticket/Name
    columns, and integer-encodes Sex and Embarked.

    Returns:
        x: DataFrame with the 7 selected model features.
        y: the 'Survived' Series, or None when the frame has no such column
           (the Kaggle test split ships unlabeled).
    """
    # Missing-value imputation.
    df['Age'] = df['Age'].fillna(df['Age'].mean())
    df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode().iloc[0])
    df['Fare'] = df['Fare'].fillna(df['Fare'].mean())

    # Drop high-cardinality / mostly-missing columns, encode categoricals.
    df.drop(['Cabin', 'Ticket', 'Name'], axis=1, inplace=True)
    df['Sex'] = df['Sex'].map({'male': 0, 'female': 1})
    df['Embarked'] = df['Embarked'].map({'S': 0, 'C': 1, 'Q': 2})

    # Feature selection.
    features = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
    x = df[features]
    # Bug fix: the original indexed df['Survived'] unconditionally, which
    # raises KeyError on the unlabeled test split; return None instead.
    y = df['Survived'] if 'Survived' in df.columns else None
    return x, y

# Build feature matrices and targets for both splits.
# NOTE(review): the Kaggle test.csv has no 'Survived' column, so the second
# call can only yield a usable y_test if data_preprocessing tolerates a
# missing target (otherwise it raises KeyError) — verify.
x_train, y_train = data_preprocessing(train_data)
x_test, y_test = data_preprocessing(test_data)

二、特征分析

1、数据集中共有12个字段,其中Survived为目标变量,其余11个为候选特征

2、Sex和Pclass对Survived的影响较大

3、Cabin、Name和Ticket对Survived的影响较小

4、SibSp和Parch这两个特征可以合并为一个特征Family_Size

5、Fare和Age可以分别按照数值大小和区间进行分组

# Merge SibSp and Parch into a single family-size feature
# (+1 counts the passenger themself).
for df in (train_data, test_data):
    df['Family_Size'] = df['SibSp'] + df['Parch'] + 1

    # Bucket Fare and Age into groups. include_lowest=True is a bug fix:
    # pd.cut builds half-open (a, b] intervals, so without it a Fare of
    # exactly 0 (which does occur in the Titanic data) fell outside every
    # bin and silently became NaN.
    df['Fare_Group'] = pd.cut(x=df['Fare'], bins=[0, 20, 40, 60, 80, 1000],
                              include_lowest=True)
    df['Age_Group'] = pd.cut(x=df['Age'], bins=[0, 12, 18, 30, 50, 100],
                             include_lowest=True)

# Feature analysis: survival rate against each selected/engineered feature,
# one bar plot per feature, shown in sequence.
for feature_name in ('Sex', 'Pclass', 'Family_Size', 'Fare_Group', 'Age_Group'):
    sns.barplot(x=feature_name, y='Survived', data=train_data)
    plt.show()

三、模型训练

1、使用逻辑回归、KNN、决策树、随机森林、GBDT等常见模型进行分类预测

2、使用交叉验证法选取最优模型,并进行模型融合

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import VotingClassifier

# Candidate classifiers, all compared under identical settings.
models = [
    LogisticRegression(),
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    GradientBoostingClassifier(),
]

# 5-fold cross-validated accuracy for each candidate.
for clf in models:
    cv_scores = cross_val_score(clf, x_train, y_train, cv=5, scoring='accuracy')
    print('Model: {}, Score: {} (+/- {})'.format(
        type(clf).__name__, cv_scores.mean(), cv_scores.std()))

# Model fusion: a soft-voting ensemble averages the base models' predicted
# class probabilities, so every estimator must implement predict_proba.
named_estimators = [
    ('Logistic Regression', LogisticRegression()),
    ('K Nearest Neighbor', KNeighborsClassifier()),
    ('Decision Tree', DecisionTreeClassifier()),
    ('Random Forest', RandomForestClassifier()),
    ('Gradient Boosting', GradientBoostingClassifier()),
]
voting_classifier = VotingClassifier(estimators=named_estimators, voting='soft')
voting_classifier.fit(x_train, y_train)
print('Voting Classifier:', voting_classifier.score(x_test, y_test))

四、模型调优

1、GridSearchCV进行参数调优以提高模型准确度

from sklearn.model_selection import GridSearchCV

# Random-forest tuning: exhaustive grid search, 5-fold CV on accuracy.
param_grid = {
    'n_estimators': [400, 500, 600, 700, 800],
    'max_features': [3, 4, 5, 6, 7],
    'criterion': ['gini', 'entropy'],
}
grid_search = GridSearchCV(
    RandomForestClassifier(random_state=123),
    param_grid,
    cv=5,
    scoring='accuracy',
)
grid_search.fit(x_train, y_train)
print('Best Parameters:', grid_search.best_params_)
print('Best Score:', grid_search.best_score_)

五、模型评估

1、使用confusion_matrix、classification_report、ROC曲线等指标对模型进行评估

from sklearn.metrics import confusion_matrix, classification_report, roc_curve, auc

# Evaluate the fitted voting ensemble on the training split.
# (Training-set scores are optimistic; treat them as a sanity check only.)
y_pred_train = voting_classifier.predict(x_train)
cm_train = confusion_matrix(y_train, y_pred_train)
report_train = classification_report(y_train, y_pred_train)
print('Train Confusion Matrix:\n', cm_train)
print('Train Classification Report:\n', report_train)

# Evaluate on the test split.
# NOTE(review): the Kaggle test.csv ships without 'Survived' labels, so
# y_test may be missing/None here and these metrics cannot actually be
# computed on the real competition data — confirm where y_test comes from.
y_pred_test = voting_classifier.predict(x_test)
cm_test = confusion_matrix(y_test, y_pred_test)
report_test = classification_report(y_test, y_pred_test)
print('Test Confusion Matrix:\n', cm_test)
print('Test Classification Report:\n', report_test)

# ROC curve and AUC value
def plot_roc_curve(model, x, y):
    """Draw the ROC curve, annotated with its AUC, for a fitted classifier.

    *model* must expose predict_proba; column 1 of its output is taken as
    the positive-class score.
    """
    positive_scores = model.predict_proba(x)[:, 1]
    fpr, tpr, _ = roc_curve(y, positive_scores)
    area_under_curve = auc(fpr, tpr)
    plt.plot(fpr, tpr,
             label='ROC Curve (area = {:.2f})'.format(area_under_curve))
    plt.plot([0, 1], [0, 1], 'k--')  # chance-level diagonal
    plt.xlim([0, 1])
    plt.ylim([0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve')
    plt.legend(loc='lower right')
    plt.show()

plot_roc_curve(voting_classifier, x_test, y_test)

原创文章,作者:JUYET,如若转载,请注明出处:https://www.506064.com/n/369414.html

(0)
打赏 微信扫一扫 微信扫一扫 支付宝扫一扫 支付宝扫一扫
JUYETJUYET
上一篇 2025-04-12 13:01
下一篇 2025-04-12 13:01

发表回复

登录后才能评论