library(tidyverse)
library(corrplot)
library(patchwork)20 综合实战案例
21 综合实战案例
本章通过一个完整的生态学案例,演示从原始数据到可视化报告的全流程。我们将分析植物多样性与土壤性质的关系。
21.1 案例背景
研究问题:广西某自然保护区内,不同海拔梯度上植物多样性与土壤理化性质有何关系?
数据来源:沿海拔梯度设置 40 个样方(200-1200m),每个样方记录植物物种数、Shannon 多样性指数,并采集表层土壤(0-20cm)测定理化性质。
21.2 第一步:数据获取与读取
# 模拟原始数据(实际项目中从 CSV 读取)
set.seed(42)
n <- 40
# 先生成海拔,后续变量都基于同一个海拔值
elevation <- sort(runif(n, 200, 1200))
# 物种丰富度随海拔递减(中低海拔最高)
richness <- rpois(n, lambda = pmax(5, 20 - 0.01 * elevation))
raw_data <- tibble(
plot_id = paste0("P", sprintf("%02d", 1:n)),
elevation = elevation,
richness = richness,
shannon = 1.5 + 0.8 * log(pmax(richness, 1)) + rnorm(n, 0, 0.2),
cover_pct = runif(n, 40, 95),
soil_ph = 5.0 + 0.001 * elevation + rnorm(n, 0, 0.3),
organic_carbon = 40 - 0.02 * elevation + rlnorm(n, 0, 0.3),
total_nitrogen = (40 - 0.02 * elevation + rlnorm(n, 0, 0.3)) / 12 + rnorm(n, 0, 0.2),
soil_moisture = 25 + 0.015 * elevation + rnorm(n, 0, 5),
slope = runif(n, 5, 45),
aspect = sample(c("N", "S", "E", "W"), n, replace = TRUE),
habitat = c(rep("常绿阔叶林", 15), rep("落叶阔叶林", 10),
rep("针阔混交林", 10), rep("亚高山灌丛", 5))
)
# 人为加入一些数据质量问题
raw_data$soil_ph[c(5, 22)] <- NA
raw_data$organic_carbon[38] <- -3.2
raw_data$richness[15] <- NA
raw_data$cover_pct[c(10, 30)] <- c(105, -2)
head(raw_data)21.3 第二步:数据清洗
# 检查数据质量
cat("数据维度:", dim(raw_data), "\n")
cat("缺失值:\n")
colSums(is.na(raw_data))
# 清洗流水线
clean_data <- raw_data |>
# 处理不合理的值
mutate(
organic_carbon = if_else(organic_carbon < 0, NA_real_, organic_carbon),
cover_pct = if_else(cover_pct < 0 | cover_pct > 100, NA_real_, cover_pct)
) |>
# 填充缺失值
mutate(
soil_ph = replace_na(soil_ph, median(soil_ph, na.rm = TRUE)),
organic_carbon = replace_na(organic_carbon, median(organic_carbon, na.rm = TRUE)),
cover_pct = replace_na(cover_pct, median(cover_pct, na.rm = TRUE))
) |>
# 删除物种数缺失的行(核心变量不填充)
filter(!is.na(richness)) |>
# 计算衍生变量
mutate(
cn_ratio = organic_carbon / total_nitrogen,
elev_band = cut(elevation, breaks = c(0, 400, 700, 1000, 1500),
labels = c("低海拔", "中低海拔", "中高海拔", "高海拔")),
log_biomass = log(organic_carbon)
)
cat("\n清洗后:", nrow(clean_data), "行\n")
summary(clean_data |> select(where(is.numeric)))21.4 第三步:探索性数据分析
21.4.1 描述性统计
clean_data |>
group_by(habitat) |>
summarise(
n = n(),
richness_mean = round(mean(richness), 1),
richness_sd = round(sd(richness), 1),
shannon_mean = round(mean(shannon), 2),
ph_mean = round(mean(soil_ph), 2),
oc_mean = round(mean(organic_carbon), 1)
)21.4.2 分布与相关性
cor_vars <- clean_data |>
select(richness, shannon, elevation, soil_ph, organic_carbon,
total_nitrogen, soil_moisture, slope)
corrplot(cor(cor_vars, use = "complete.obs"),
method = "color", type = "lower",
addCoef.col = "black", number.cex = 0.6,
tl.col = "black", tl.cex = 0.7,
col = colorRampPalette(c("#2166ac", "white", "#b2182b"))(200))21.5 第四步:核心分析与可视化
21.5.1 海拔梯度上的多样性变化
p1 <- ggplot(clean_data, aes(x = elevation, y = richness)) +
geom_point(aes(color = habitat), size = 2.5, alpha = 0.7) +
geom_smooth(method = "loess", color = "black", linewidth = 0.8) +
labs(x = "海拔 (m)", y = "物种丰富度", color = "植被类型") +
theme_bw()
p2 <- ggplot(clean_data, aes(x = elevation, y = shannon)) +
geom_point(aes(color = habitat), size = 2.5, alpha = 0.7) +
geom_smooth(method = "loess", color = "black", linewidth = 0.8) +
labs(x = "海拔 (m)", y = "Shannon 指数", color = "植被类型") +
theme_bw()
p1 + p2 + plot_layout(guides = "collect") &
theme(legend.position = "bottom")21.5.2 土壤因子与多样性的关系
p_ph <- ggplot(clean_data, aes(x = soil_ph, y = richness)) +
geom_point(alpha = 0.6) +
geom_smooth(method = "lm", color = "steelblue") +
labs(x = "土壤 pH", y = "物种丰富度") +
theme_bw()
p_oc <- ggplot(clean_data, aes(x = organic_carbon, y = richness)) +
geom_point(alpha = 0.6) +
geom_smooth(method = "lm", color = "coral") +
labs(x = "有机碳 (g/kg)", y = "物种丰富度") +
theme_bw()
p_moist <- ggplot(clean_data, aes(x = soil_moisture, y = richness)) +
geom_point(alpha = 0.6) +
geom_smooth(method = "lm", color = "forestgreen") +
labs(x = "土壤含水量 (%)", y = "物种丰富度") +
theme_bw()
p_ph + p_oc + p_moist +
plot_annotation(tag_levels = "a", tag_prefix = "(", tag_suffix = ")")21.5.3 不同植被类型的比较
ggplot(clean_data, aes(x = reorder(habitat, -richness), y = richness, fill = habitat)) +
geom_boxplot(alpha = 0.7, outlier.shape = NA) +
geom_jitter(width = 0.15, alpha = 0.4, size = 1.5) +
labs(x = "", y = "物种丰富度",
title = "不同植被类型的物种丰富度比较") +
scale_fill_brewer(palette = "Set2") +
theme_bw() +
theme(legend.position = "none")21.6 第五步:简单统计检验
# 线性回归:物种丰富度 ~ 海拔
model <- lm(richness ~ elevation + soil_ph + organic_carbon + soil_moisture, data = clean_data)
summary(model)# 方差分析:不同植被类型的物种丰富度差异
anova_result <- aov(richness ~ habitat, data = clean_data)
summary(anova_result)21.7 第六步:数据导出与元数据记录
一个完整的分析项目应该保存处理后的数据,并生成数据字典。
# 自动生成 codebook
codebook <- tibble(
variable = names(clean_data),
type = map_chr(clean_data, ~class(.x)[1]),
n_missing = map_int(clean_data, ~sum(is.na(.x))),
n_unique = map_int(clean_data, ~n_distinct(.x)),
example = map_chr(clean_data, ~paste(head(unique(.x), 3), collapse = ", "))
)
codebook |> filter(type %in% c("numeric", "integer"))# 保存清洗后的数据和 codebook
write_csv(clean_data, "data/processed/biodiversity_clean.csv")
saveRDS(clean_data, "data/processed/biodiversity_clean.rds")
write_csv(codebook, "data/processed/biodiversity_codebook.csv")
Tip项目目录结构建议
my_project/
├── data/
│ ├── raw/ # 原始数据(只读)
│ └── processed/ # 清洗后的数据 + codebook
├── analysis.qmd # 分析脚本(本文件)
├── report.html # 渲染后的报告
└── README.md # 项目说明和元数据
21.8 第七步:生成可重复性报告
本章的所有代码都写在一个 .qmd 文件中,可以一键渲染为 HTML 报告。这就是可重复性研究的核心——任何人拿到你的数据和代码,都能重现完全相同的结果。
21.8.1 可重复性检查清单
21.9 课后练习
- 选择一个你感兴趣的生态学问题,从公共数据库下载数据
- 按照本章的七步流程完成完整的数据分析(获取→清洗→探索→分析→检验→导出→报告)
- 将分析写成 Quarto 文档,渲染为 HTML 报告
- 将项目(数据 + 代码 + 报告)提交到 GitHub