# Install the required packages only when they are not already available,
# so re-running the script does not reinstall them every time.
required_pkgs <- c("ggplot2", "randomForest", "calibrate")
missing_pkgs <- required_pkgs[
  !vapply(required_pkgs, requireNamespace, logical(1), quietly = TRUE)
]
if (length(missing_pkgs) > 0) {
  install.packages(missing_pkgs)
}
library(ggplot2) # data visualisation
library(randomForest) # random forest models
数据获取与加载
# Processed Cleveland heart-disease file hosted on the UCI archive
URL <- "https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data"
# The file is read without a header row, so the 14 column names are
# attached explicitly right after reading.
data <- setNames(
  read.csv(URL, header = FALSE),
  c(
    "age", "sex", "cp", "trestbps", "chol", "fbs", "restecg",
    "thalach", "exang", "oldpeak", "slope", "ca", "thal", "hd"
  )
)
# Preview the first six rows
head(data)
数据清洗与预处理
缺失值处理:将缺失值“?”替换为 NA。
数据类型转换:将部分变量转换为因子类型。
# Flag every "?" placeholder as a missing value
is.na(data) <- data == "?"
# Recode sex: 0 becomes "F", any other value becomes "M"
sex_label <- ifelse(data$sex == 0, "F", "M")
data$sex <- factor(sex_label)
# Convert these columns to factors as-is
factor_cols <- c("cp", "ca", "thal")
data[factor_cols] <- lapply(data[factor_cols], factor)
# Collapse the outcome to two classes: 0 is "healthy", anything else "unhealthy"
hd_label <- ifelse(data$hd == 0, "healthy", "unhealthy")
data$hd <- factor(hd_label)
# Inspect the resulting column types
str(data)
缺失值填补
# Fix the RNG seed before the randomised imputation step
set.seed(123)
# Fill in the missing predictor values with randomForest's rfImpute(),
# using hd as the response and 20 trees per forest
data_imputed <- rfImpute(
  hd ~ .,
  data = data,
  ntree = 20
)
# Preview the imputed data
head(data_imputed)
构建随机森林模型
使用 randomForest 函数构建模型。
# 构建随机森林模型
# Fix the RNG seed before fitting
set.seed(123)
# Fit a random forest predicting hd from all other columns; proximity = TRUE
# additionally requests the sample-by-sample proximity matrix
model <- randomForest(
  hd ~ .,
  data = data_imputed,
  proximity = TRUE
)
# Print the model summary
print(model)
输出示例
OOB误差率: 约16.5%(即约83.5%的样本被正确分类;具体数值会随随机种子略有变化)
混淆矩阵:
树数量的优化
# Collect the out-of-bag (OOB) error rate after each successive tree.
# The number of trees is taken from the model's err.rate matrix (one row per
# tree) instead of being hard-coded, so the table still lines up if the
# forest is refit with a different ntree.
error_rate <- data.frame(
  Trees = seq_len(nrow(model$err.rate)),
  Error = model$err.rate[, "OOB"]
)
# Plot the OOB error rate against the number of trees
ggplot(error_rate, aes(x = Trees, y = Error)) +
  geom_line(color = "blue") +
  labs(title = "OOB误差率随树数量变化", x = "树的数量", y = "OOB误差率")
变量随机抽样的优化
通过尝试不同的 mtry 值(每次分裂所考虑的变量数量),找到最佳参数。
# 初始化向量
# Candidate mtry values and a preallocated vector for their final OOB errors
mtry_values <- 1:10
oob_errors <- numeric(length(mtry_values))
# Seed the RNG so the sweep is reproducible, like the other random steps
set.seed(123)
# Fit one forest per candidate mtry and record its final OOB error
for (i in seq_along(mtry_values)) {
  temp_model <- randomForest(
    hd ~ .,
    data = data_imputed,
    mtry = mtry_values[i],
    ntree = 500
  )
  # The last row of err.rate is the error using all trees; index it by
  # nrow() so it stays correct if ntree above is changed.
  oob_errors[i] <- temp_model$err.rate[[nrow(temp_model$err.rate), "OOB"]]
}
# Plot the OOB error rate against mtry
data_mtry <- data.frame(mtry = mtry_values, OOB_Error = oob_errors)
ggplot(data_mtry, aes(x = mtry, y = OOB_Error)) +
  geom_line(color = "red") +
  labs(title = "不同mtry值的OOB误差率", x = "mtry值", y = "OOB误差率")
样本相似性可视化(MDS图)
# Turn the model's proximity matrix into a distance object (1 - proximity)
dist_matrix <- as.dist(1 - model$proximity)
# Classical multidimensional scaling down to two dimensions
mds <- cmdscale(dist_matrix, k = 2, eig = TRUE)
# Pair the two MDS coordinates with each sample's hd class
mds_data <- data.frame(
  X = mds$points[, 1],
  Y = mds$points[, 2],
  Class = data_imputed$hd
)
# Scatter plot of the samples in MDS space, coloured by class
ggplot(data = mds_data, mapping = aes(x = X, y = Y, colour = Class)) +
  geom_point() +
  labs(title = "MDS多维尺度图", x = "第一维", y = "第二维")