library(tidyverse)
library(lubridate)15 数据清洗
16 数据清洗
真实世界的数据从来都不是”干净”的。缺失值、异常值、格式不一致、重复记录……这些问题在生态学数据中尤为常见。数据清洗是数据分析的第一步,也是最重要的一步——垃圾进,垃圾出(Garbage In, Garbage Out)。
16.1 示例数据
我们使用一个模拟的土壤调查数据集来演示数据清洗的各个步骤:
set.seed(2027)
soil_raw <- tibble(
site_id = rep(paste0("S", sprintf("%02d", 1:20)), each = 3),
depth = rep(c("0-10", "10-20", "20-30"), 20),
ph = c(rnorm(58, 6.2, 0.8), NA, -1.5), # 含缺失值和异常值
organic_carbon = c(rnorm(55, 25, 8), NA, NA, NA, 150, -5), # 含缺失值和异常值
nitrogen = c(rnorm(57, 2.1, 0.6), NA, NA, 0),
texture = c(rep("clay", 15), rep("loam", 15), rep("sand", 15),
"Clay", "LOAM", " sand ", "clay ", "loam", # 格式不一致
rep("clay", 5), rep("loam", 5)),
date = c(rep("2027-03-15", 30), rep("2027/03/16", 15), # 日期格式不一致
rep("15-Mar-2027", 10), rep("2027-03-17", 5))
)
# 添加重复行
soil_raw <- bind_rows(soil_raw, soil_raw[c(1, 1, 5), ])
head(soil_raw, 10)
dim(soil_raw)16.2 第一步:了解你的数据
在动手清洗之前,先全面了解数据的基本情况:
# 数据维度
dim(soil_raw)
# 数据结构
str(soil_raw)
# 描述性统计
summary(soil_raw)
# 每列缺失值数量
colSums(is.na(soil_raw))16.3 处理重复值
# 检查重复行
sum(duplicated(soil_raw))
# 查看重复的行
soil_raw[duplicated(soil_raw), ]
# 去除重复行
soil <- soil_raw |>
distinct()
cat("去重前:", nrow(soil_raw), "行\n")
cat("去重后:", nrow(soil), "行\n")16.4 处理异常值
Important为什么先处理异常值?
异常值会影响均值、中位数等统计量的计算。如果先填充缺失值,填充值会被异常值”污染”。因此,正确的顺序是:先移除/修正异常值,再填充缺失值。
16.4.1 识别异常值
# 箱线图识别异常值
soil |>
select(where(is.numeric)) |>
pivot_longer(everything(), names_to = "variable", values_to = "value") |>
ggplot(aes(x = variable, y = value)) +
geom_boxplot(fill = "lightblue") +
facet_wrap(~variable, scales = "free") +
labs(title = "数值变量箱线图(异常值检测)") +
theme_minimal()16.4.2 IQR 方法
# 用 IQR 方法检测异常值
detect_outliers <- function(x) {
q1 <- quantile(x, 0.25, na.rm = TRUE)
q3 <- quantile(x, 0.75, na.rm = TRUE)
iqr <- q3 - q1
lower <- q1 - 1.5 * iqr
upper <- q3 + 1.5 * iqr
x < lower | x > upper
}
# 检测各列异常值
soil |>
summarise(
ph_outliers = sum(detect_outliers(ph), na.rm = TRUE),
oc_outliers = sum(detect_outliers(organic_carbon), na.rm = TRUE),
n_outliers = sum(detect_outliers(nitrogen), na.rm = TRUE)
)16.4.3 处理异常值
# 查看异常值
soil |>
filter(detect_outliers(ph) | detect_outliers(organic_carbon)) |>
select(site_id, depth, ph, organic_carbon)
# 将不合理的值替换为 NA(pH 不可能为负数,有机碳不可能为 150 或负数)
soil_clean <- soil |>
mutate(
ph = if_else(ph < 0 | ph > 14, NA_real_, ph),
organic_carbon = if_else(organic_carbon < 0 | organic_carbon > 100, NA_real_, organic_carbon),
nitrogen = if_else(nitrogen < 0, NA_real_, nitrogen)
)
Note异常值 ≠ 错误值
异常值不一定是错误的。在生态学中,极端值可能反映了真实的生态现象(如极端干旱下的土壤 pH 变化)。处理异常值时要结合专业知识判断,不能机械地删除。
16.5 处理缺失值
16.5.1 识别缺失值
# 每列缺失值统计
soil_clean |>
summarise(across(everything(), ~sum(is.na(.)))) |>
pivot_longer(everything(), names_to = "variable", values_to = "n_missing") |>
filter(n_missing > 0)16.5.2 可视化缺失模式
# 简单的缺失值热力图
soil_clean |>
mutate(row = row_number()) |>
pivot_longer(-row, names_to = "variable", values_to = "value") |>
mutate(is_missing = is.na(value)) |>
ggplot(aes(x = variable, y = row, fill = is_missing)) +
geom_tile() +
scale_fill_manual(values = c("grey90", "red"), labels = c("有值", "缺失")) +
labs(title = "缺失值分布", x = "", y = "行号", fill = "") +
theme_minimal()16.5.3 处理策略
根据缺失的原因和比例,选择不同的处理方式:
| 策略 | 适用场景 | R 实现 |
|---|---|---|
| 删除行 | 缺失比例低(<5%),随机缺失 | drop_na() |
| 删除列 | 某列缺失比例过高(>50%) | select(-col) |
| 均值/中位数填充 | 数值型,缺失比例适中 | replace_na() |
| 分组填充 | 缺失值与分组有关 | group_by() + fill() |
| 标记为单独类别 | 分类变量 | replace_na(list(x = "未知")) |
# 方法1:删除含缺失值的行
soil_complete <- soil_clean |> drop_na()
cat("删除缺失后:", nrow(soil_complete), "行\n")
# 方法2:用中位数填充数值型缺失值(异常值已移除,中位数更可靠)
soil_clean <- soil_clean |>
mutate(
ph = replace_na(ph, median(ph, na.rm = TRUE)),
organic_carbon = replace_na(organic_carbon, median(organic_carbon, na.rm = TRUE)),
nitrogen = replace_na(nitrogen, median(nitrogen, na.rm = TRUE))
)
Warning缺失值处理的注意事项
- 先搞清楚缺失的原因(随机缺失 vs 系统性缺失)
- 不要盲目删除——可能引入偏差
- 不要盲目填充——可能掩盖真实模式
- 在论文中必须报告缺失值的处理方式
16.6 统一数据格式
16.6.1 字符串清洗
# 查看 texture 列的唯一值
unique(soil_clean$texture)
# 统一格式:去空格、转小写
soil_clean <- soil_clean |>
mutate(
texture = texture |>
str_trim() |> # 去除首尾空格
str_to_lower() # 转为小写
)
unique(soil_clean$texture)16.6.2 日期格式统一
# 查看日期格式
unique(soil_clean$date)
# 使用 lubridate 统一日期格式
soil_clean <- soil_clean |>
mutate(
date = parse_date_time(date, orders = c("ymd", "y/m/d", "d-b-Y")) |>
as.Date()
)
unique(soil_clean$date)
class(soil_clean$date)16.7 数据类型转换
# 将 texture 转为因子
soil_clean <- soil_clean |>
mutate(
texture = factor(texture, levels = c("sand", "loam", "clay")),
depth = factor(depth, levels = c("0-10", "10-20", "20-30"), ordered = TRUE)
)
str(soil_clean)16.8 完整的数据清洗流水线
将以上步骤整合为一个完整的流水线:
soil_final <- soil_raw |>
# 1. 去重
distinct() |>
# 2. 统一字符串格式
mutate(
texture = texture |> str_trim() |> str_to_lower()
) |>
# 3. 统一日期格式
mutate(
date = parse_date_time(date, orders = c("ymd", "y/m/d", "d-b-Y")) |> as.Date()
) |>
# 4. 处理不合理的值(异常值 → NA)
mutate(
ph = if_else(ph < 0 | ph > 14, NA_real_, ph),
organic_carbon = if_else(organic_carbon < 0 | organic_carbon > 100, NA_real_, organic_carbon),
nitrogen = if_else(nitrogen < 0, NA_real_, nitrogen)
) |>
# 5. 填充缺失值(中位数,此时异常值已移除,统计量更准确)
mutate(
ph = replace_na(ph, median(ph, na.rm = TRUE)),
organic_carbon = replace_na(organic_carbon, median(organic_carbon, na.rm = TRUE)),
nitrogen = replace_na(nitrogen, median(nitrogen, na.rm = TRUE))
) |>
# 6. 转换数据类型
mutate(
texture = factor(texture, levels = c("sand", "loam", "clay")),
depth = factor(depth, levels = c("0-10", "10-20", "20-30"), ordered = TRUE)
)
# 验证
summary(soil_final)16.9 清洗前后对比报告
# 构建对比报告
report <- tibble(
指标 = c("总行数", "重复行数", "缺失值总数", "pH 异常值", "有机碳异常值", "氮异常值",
"日期格式数", "texture 类别数"),
清洗前 = c(
nrow(soil_raw),
sum(duplicated(soil_raw)),
sum(is.na(soil_raw)),
sum(soil_raw$ph < 0 | soil_raw$ph > 14, na.rm = TRUE),
sum(soil_raw$organic_carbon < 0 | soil_raw$organic_carbon > 100, na.rm = TRUE),
sum(soil_raw$nitrogen < 0, na.rm = TRUE),
length(unique(soil_raw$date)),
length(unique(soil_raw$texture))
),
清洗后 = c(
nrow(soil_final),
sum(duplicated(soil_final)),
sum(is.na(soil_final)),
sum(soil_final$ph < 0 | soil_final$ph > 14, na.rm = TRUE),
sum(soil_final$organic_carbon < 0 | soil_final$organic_carbon > 100, na.rm = TRUE),
sum(soil_final$nitrogen < 0, na.rm = TRUE),
length(unique(soil_final$date)),
length(unique(as.character(soil_final$texture)))
)
)
report16.10 课后练习
- 下载一个真实的生态学数据集(如从 GBIF 下载物种观测数据)
- 检查数据中的缺失值、异常值和格式问题
- 编写一个完整的数据清洗流水线
- 对比清洗前后的数据质量(缺失值数量、异常值数量、数据类型)
- 将清洗脚本和清洗报告提交到 GitHub