15  数据清洗

16 数据清洗

真实世界的数据从来都不是”干净”的。缺失值、异常值、格式不一致、重复记录……这些问题在生态学数据中尤为常见。数据清洗是数据分析的第一步,也是最重要的一步——垃圾进,垃圾出(Garbage In, Garbage Out)。

library(tidyverse)
library(lubridate)

16.1 示例数据

我们使用一个模拟的土壤调查数据集来演示数据清洗的各个步骤:

set.seed(2027)

soil_raw <- tibble(
  site_id = rep(paste0("S", sprintf("%02d", 1:20)), each = 3),
  depth = rep(c("0-10", "10-20", "20-30"), 20),
  ph = c(rnorm(58, 6.2, 0.8), NA, -1.5),                    # 含缺失值和异常值
  organic_carbon = c(rnorm(55, 25, 8), NA, NA, NA, 150, -5), # 含缺失值和异常值
  nitrogen = c(rnorm(57, 2.1, 0.6), NA, NA, 0),
  texture = c(rep("clay", 15), rep("loam", 15), rep("sand", 15),
              "Clay", "LOAM", " sand ", "clay ", "loam",      # 格式不一致
              rep("clay", 5), rep("loam", 5)),
  date = c(rep("2027-03-15", 30), rep("2027/03/16", 15),     # 日期格式不一致
           rep("15-Mar-2027", 10), rep("2027-03-17", 5))
)

# 添加重复行
soil_raw <- bind_rows(soil_raw, soil_raw[c(1, 1, 5), ])

head(soil_raw, 10)
dim(soil_raw)

16.2 第一步:了解你的数据

在动手清洗之前,先全面了解数据的基本情况:

# 数据维度
dim(soil_raw)

# 数据结构
str(soil_raw)

# 描述性统计
summary(soil_raw)

# 每列缺失值数量
colSums(is.na(soil_raw))

16.3 处理重复值

# 检查重复行
sum(duplicated(soil_raw))

# 查看重复的行
soil_raw[duplicated(soil_raw), ]

# 去除重复行
soil <- soil_raw |>
  distinct()

cat("去重前:", nrow(soil_raw), "行\n")
cat("去重后:", nrow(soil), "行\n")

16.4 处理异常值

Important为什么先处理异常值?

异常值会影响均值、中位数等统计量的计算。如果先填充缺失值,填充值会被异常值”污染”。因此,正确的顺序是:先移除/修正异常值,再填充缺失值。

16.4.1 识别异常值

# 箱线图识别异常值
soil |>
  select(where(is.numeric)) |>
  pivot_longer(everything(), names_to = "variable", values_to = "value") |>
  ggplot(aes(x = variable, y = value)) +
  geom_boxplot(fill = "lightblue") +
  facet_wrap(~variable, scales = "free") +
  labs(title = "数值变量箱线图(异常值检测)") +
  theme_minimal()

16.4.2 IQR 方法

# 用 IQR 方法检测异常值
detect_outliers <- function(x) {
  q1 <- quantile(x, 0.25, na.rm = TRUE)
  q3 <- quantile(x, 0.75, na.rm = TRUE)
  iqr <- q3 - q1
  lower <- q1 - 1.5 * iqr
  upper <- q3 + 1.5 * iqr
  x < lower | x > upper
}

# 检测各列异常值
soil |>
  summarise(
    ph_outliers = sum(detect_outliers(ph), na.rm = TRUE),
    oc_outliers = sum(detect_outliers(organic_carbon), na.rm = TRUE),
    n_outliers = sum(detect_outliers(nitrogen), na.rm = TRUE)
  )

16.4.3 处理异常值

# 查看异常值
soil |>
  filter(detect_outliers(ph) | detect_outliers(organic_carbon)) |>
  select(site_id, depth, ph, organic_carbon)

# 将不合理的值替换为 NA(pH 不可能为负数,有机碳不可能为 150 或负数)
soil_clean <- soil |>
  mutate(
    ph = if_else(ph < 0 | ph > 14, NA_real_, ph),
    organic_carbon = if_else(organic_carbon < 0 | organic_carbon > 100, NA_real_, organic_carbon),
    nitrogen = if_else(nitrogen < 0, NA_real_, nitrogen)
  )
Note异常值 ≠ 错误值

异常值不一定是错误的。在生态学中,极端值可能反映了真实的生态现象(如极端干旱下的土壤 pH 变化)。处理异常值时要结合专业知识判断,不能机械地删除。

16.5 处理缺失值

16.5.1 识别缺失值

# 每列缺失值统计
soil_clean |>
  summarise(across(everything(), ~sum(is.na(.)))) |>
  pivot_longer(everything(), names_to = "variable", values_to = "n_missing") |>
  filter(n_missing > 0)

16.5.2 可视化缺失模式

# 简单的缺失值热力图
soil_clean |>
  mutate(row = row_number()) |>
  pivot_longer(-row, names_to = "variable", values_to = "value") |>
  mutate(is_missing = is.na(value)) |>
  ggplot(aes(x = variable, y = row, fill = is_missing)) +
  geom_tile() +
  scale_fill_manual(values = c("grey90", "red"), labels = c("有值", "缺失")) +
  labs(title = "缺失值分布", x = "", y = "行号", fill = "") +
  theme_minimal()

16.5.3 处理策略

根据缺失的原因和比例,选择不同的处理方式:

策略 适用场景 R 实现
删除行 缺失比例低(<5%),随机缺失 drop_na()
删除列 某列缺失比例过高(>50%) select(-col)
均值/中位数填充 数值型,缺失比例适中 replace_na()
分组填充 缺失值与分组有关 group_by() + fill()
标记为单独类别 分类变量 replace_na(list(x = "未知"))
# 方法1:删除含缺失值的行
soil_complete <- soil_clean |> drop_na()
cat("删除缺失后:", nrow(soil_complete), "行\n")

# 方法2:用中位数填充数值型缺失值(异常值已移除,中位数更可靠)
soil_clean <- soil_clean |>
  mutate(
    ph = replace_na(ph, median(ph, na.rm = TRUE)),
    organic_carbon = replace_na(organic_carbon, median(organic_carbon, na.rm = TRUE)),
    nitrogen = replace_na(nitrogen, median(nitrogen, na.rm = TRUE))
  )
Warning缺失值处理的注意事项
  • 先搞清楚缺失的原因(随机缺失 vs 系统性缺失)
  • 不要盲目删除——可能引入偏差
  • 不要盲目填充——可能掩盖真实模式
  • 在论文中必须报告缺失值的处理方式

16.6 统一数据格式

16.6.1 字符串清洗

# 查看 texture 列的唯一值
unique(soil_clean$texture)

# 统一格式:去空格、转小写
soil_clean <- soil_clean |>
  mutate(
    texture = texture |>
      str_trim() |>          # 去除首尾空格
      str_to_lower()         # 转为小写
  )

unique(soil_clean$texture)

16.6.2 日期格式统一

# 查看日期格式
unique(soil_clean$date)

# 使用 lubridate 统一日期格式
soil_clean <- soil_clean |>
  mutate(
    date = parse_date_time(date, orders = c("ymd", "y/m/d", "d-b-Y")) |>
      as.Date()
  )

unique(soil_clean$date)
class(soil_clean$date)

16.7 数据类型转换

# 将 texture 转为因子
soil_clean <- soil_clean |>
  mutate(
    texture = factor(texture, levels = c("sand", "loam", "clay")),
    depth = factor(depth, levels = c("0-10", "10-20", "20-30"), ordered = TRUE)
  )

str(soil_clean)

16.8 完整的数据清洗流水线

将以上步骤整合为一个完整的流水线:

soil_final <- soil_raw |>
  # 1. 去重
  distinct() |>
  # 2. 统一字符串格式
  mutate(
    texture = texture |> str_trim() |> str_to_lower()
  ) |>
  # 3. 统一日期格式
  mutate(
    date = parse_date_time(date, orders = c("ymd", "y/m/d", "d-b-Y")) |> as.Date()
  ) |>
  # 4. 处理不合理的值(异常值 → NA)
  mutate(
    ph = if_else(ph < 0 | ph > 14, NA_real_, ph),
    organic_carbon = if_else(organic_carbon < 0 | organic_carbon > 100, NA_real_, organic_carbon),
    nitrogen = if_else(nitrogen < 0, NA_real_, nitrogen)
  ) |>
  # 5. 填充缺失值(中位数,此时异常值已移除,统计量更准确)
  mutate(
    ph = replace_na(ph, median(ph, na.rm = TRUE)),
    organic_carbon = replace_na(organic_carbon, median(organic_carbon, na.rm = TRUE)),
    nitrogen = replace_na(nitrogen, median(nitrogen, na.rm = TRUE))
  ) |>
  # 6. 转换数据类型
  mutate(
    texture = factor(texture, levels = c("sand", "loam", "clay")),
    depth = factor(depth, levels = c("0-10", "10-20", "20-30"), ordered = TRUE)
  )

# 验证
summary(soil_final)

16.9 清洗前后对比报告

# 构建对比报告
report <- tibble(
  指标 = c("总行数", "重复行数", "缺失值总数", "pH 异常值", "有机碳异常值", "氮异常值",
           "日期格式数", "texture 类别数"),
  清洗前 = c(
    nrow(soil_raw),
    sum(duplicated(soil_raw)),
    sum(is.na(soil_raw)),
    sum(soil_raw$ph < 0 | soil_raw$ph > 14, na.rm = TRUE),
    sum(soil_raw$organic_carbon < 0 | soil_raw$organic_carbon > 100, na.rm = TRUE),
    sum(soil_raw$nitrogen < 0, na.rm = TRUE),
    length(unique(soil_raw$date)),
    length(unique(soil_raw$texture))
  ),
  清洗后 = c(
    nrow(soil_final),
    sum(duplicated(soil_final)),
    sum(is.na(soil_final)),
    sum(soil_final$ph < 0 | soil_final$ph > 14, na.rm = TRUE),
    sum(soil_final$organic_carbon < 0 | soil_final$organic_carbon > 100, na.rm = TRUE),
    sum(soil_final$nitrogen < 0, na.rm = TRUE),
    length(unique(soil_final$date)),
    length(unique(as.character(soil_final$texture)))
  )
)

report

16.10 课后练习

  1. 下载一个真实的生态学数据集(如从 GBIF 下载物种观测数据)
  2. 检查数据中的缺失值、异常值和格式问题
  3. 编写一个完整的数据清洗流水线
  4. 对比清洗前后的数据质量(缺失值数量、异常值数量、数据类型)
  5. 将清洗脚本和清洗报告提交到 GitHub