教学内容
- 本节课主要介绍如何运用R语言进行双因素方差检验。
- 以下内容仅用于北京林业大学生态与自然保护学院《生物统计学》的第五节上机内容。
参考资料
- https://rcompanion.org/handbook/
- 《R语言与统计分析》(汤银才著)
数据下载链接
R语言教程
# 预处理
# 清理内存
rm(list = ls())
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 463364 24.8 1002031 53.6 643731 34.4
## Vcells 833007 6.4 8388608 64.0 1649543 12.6
# 加载packages
# 如果packages加载没成功
# install.packages("agricolae")
# install.packages("car")
# install.packages("psych")
# install.packages("skimr")
# install.packages("phia")
library(agricolae)
library(car)
library(psych)
library(skimr)
library(phia)
# 加载工作区域
# 需要在setwd里面设置本地磁盘对应的文件路径
setwd("I:/2018年生物统计课程/数据")
getwd()
## [1] "I:/2018年生物统计课程/数据"
双因素方差检验 Two-way ANOVA
练习题1:香菇草是来自于欧洲的一种外来归化园艺植物。香菇草的归化潜力将会受到本土环境种间竞争关系和繁殖体初始数量的共同影响。本研究将香菇草分别种植在种间竞争环境和对照环境(with vs. without),以及设定香菇草的初始数量为1株或5株(1 vs. 5),经过数月培养,最终获得平均生物量指标。请分析两种因素对香菇草生长能力是否存在显著性影响?
competition | no.individuals | biomass |
---|---|---|
without | single | 20.89 |
without | single | 26.22 |
without | single | 33.55 |
without | single | 29.89 |
without | single | 27.61 |
without | five | 9.82 |
without | five | 8.42 |
without | five | 12.02 |
without | five | 7.21 |
without | five | 10.37 |
with | single | 2.51 |
with | single | 1.13 |
with | single | 0.29 |
with | single | 0.56 |
with | single | 1.65 |
with | five | 1.61 |
with | five | 1.33 |
with | five | 1.32 |
with | five | 1.65 |
with | five | 1.44 |
数据预处理过程
# 加载数据
# 需要在read.csv里面设置本地磁盘对应的文件路径
competition.data <- read.csv("I:/2018年生物统计课程/数据/lesson_05_two_way_aov.csv")
# 将处理变量competition变为处理因子
# 将处理变量no.individuals变为处理因子
competition.data$competition <- as.factor(competition.data$competition)
competition.data$no.individuals <- as.factor(competition.data$no.individuals)
# 统计描述过程
# 查看数据格式
head(competition.data) # 展示数据前六行
## competition no.individuals biomass
## 1 without single 20.89
## 2 without single 26.22
## 3 without single 33.55
## 4 without single 29.89
## 5 without single 27.61
## 6 without five 9.82
str(competition.data) # 展示数据格式
## 'data.frame': 20 obs. of 3 variables:
## $ competition : Factor w/ 2 levels "with","without": 2 2 2 2 2 2 2 2 2 2 ...
## $ no.individuals: Factor w/ 2 levels "five","single": 2 2 2 2 2 1 1 1 1 1 ...
## $ biomass : num 20.9 26.2 33.5 29.9 27.6 ...
# 使用skimr包进行更具体的统计描述
skim(competition.data)
Data summary
Name | competition.data |
---|---|
Number of rows | 20 |
Number of columns | 3 |
————————- | |
Column type frequency: | |
factor | 2 |
numeric | 1 |
————————- | |
Group variables | None |
Variable type: factor
skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
---|---|---|---|---|---|
competition | 0 | 1 | FALSE | 2 | wit: 10, wit: 10 |
no.individuals | 0 | 1 | FALSE | 2 | fiv: 10, sin: 10 |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
biomass | 0 | 1 | 9.97 | 11.26 | 0.29 | 1.41 | 4.86 | 14.24 | 33.55 | ▇▃▁▂▂ |
方差分析过程
# 方差齐性和正太性检验的具体过程
# 数据不满足方差齐性,对数据进行sqrt转化
competition.data$biomass.sqrt <- sqrt(competition.data$biomass)
# step1:使用leveneTest进行方差齐性检验
leveneTest(biomass.sqrt ~ competition*no.individuals, data = competition.data)
## Levene's Test for Homogeneity of Variance (center = median)
## Df F value Pr(>F)
## group 3 1.8956 0.1711
## 16
# step2:使用qqPlot对数据进行qq图绘制
# 对数据进行线性拟合
# library(car)
fit.lm <- lm(biomass.sqrt ~ competition + no.individuals + competition:no.individuals, data = competition.data)
# 做qqPlot,主观判断数据是否符合正太分布
qqPlot(fit.lm, # 拟合方程
simulate = TRUE, # 分布模拟?
labels = TRUE, # 对异常值进行标记?
col = "red", # 点的颜色
main = "qqplot", # 总标题
ylab = "Studentized residual" # y轴标题
)
## [1] 1 3
# step3:使用shapiro.test对数据进行正态性检验
# 在做正态性检验之前,需要对处理进行重命名,以方便分组
competition.data$treatment.group <- paste(competition.data$competition, competition.data$no.individuals, sep ="_")
# 一般而言,
# 当分析大于50行的大样本数据时,用Kolmogorov-Smirnov检验;
# 当分析小于50行的小样本数据时,用Shapiro-Wilk 检验;
# shapiro.test(x)
# 使用tapply功能对数据进行shapiro.test的批量处理
tapply(competition.data$biomass.sqrt, competition.data$treatment.group, shapiro.test)
## $with_five
##
## Shapiro-Wilk normality test
##
## data: X[[i]]
## W = 0.86732, p-value = 0.2557
##
##
## $with_single
##
## Shapiro-Wilk normality test
##
## data: X[[i]]
## W = 0.98082, p-value = 0.9389
##
##
## $without_five
##
## Shapiro-Wilk normality test
##
## data: X[[i]]
## W = 0.98846, p-value = 0.974
##
##
## $without_single
##
## Shapiro-Wilk normality test
##
## data: X[[i]]
## W = 0.97921, p-value = 0.9303
# 方差分析的具体过程
# step1:使用lm对数据进行线性拟合
# fit.lm <- lm(biomass.sqrt ~ competition + no.individuals + competition:no.individuals, data = competition.data)
# step2: 使用car包的Anova对数据进行方差分析
# SPSS默认的方差分析类型为III型
# library(car)
Anova(fit.lm, type = "III")
## Anova Table (Type III tests)
##
## Response: biomass.sqrt
## Sum Sq Df F value Pr(>F)
## (Intercept) 7.3339 1 62.1413 6.710e-07 ***
## competition 8.7464 1 74.1097 2.115e-07 ***
## no.individuals 0.0700 1 0.5934 0.4523
## competition:no.individuals 6.7676 1 57.3429 1.122e-06 ***
## Residuals 1.8883 16
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# step3.1: 使用agricolae包的Tukey检验对主效应competition进行多重比较
# library(agricolae)
comparison_competition <- HSD.test(fit.lm, # 拟合的模型
"competition", # 处理因子
group = TRUE,
main = "Tukey comparison")
# 展示competition的结果
comparison_competition
## $statistics
## MSerror Df Mean CV MSD
## 0.1180203 16 2.644353 12.99149 0.325694
##
## $parameters
## test name.t ntr StudentizedRange alpha
## Tukey competition 2 2.997999 0.05
##
## $means
## biomass.sqrt std r Min Max Q25 Q50 Q75
## with 1.127423 0.2942346 10 0.5385165 1.584298 1.084489 1.176628 1.280607
## without 4.161283 1.1943034 10 2.6851443 5.792236 3.155328 4.018773 5.221028
##
## $comparison
## NULL
##
## $groups
## biomass.sqrt groups
## without 4.161283 a
## with 1.127423 b
##
## attr(,"class")
## [1] "group"
# step3.2: 使用agricolae包的Tukey检验对主效应no.individuals数据进行多重比较
# library(agricolae)
comparison_no.individuals <- HSD.test(fit.lm, # 拟合的模型
"no.individuals", # 处理因子
group = TRUE,
main = "Tukey comparison")
# 展示no.individuals分析的结果
comparison_no.individuals
## $statistics
## MSerror Df Mean CV MSD
## 0.1180203 16 2.644353 12.99149 0.325694
##
## $parameters
## test name.t ntr StudentizedRange alpha
## Tukey no.individuals 2 2.997999 0.05
##
## $means
## biomass.sqrt std r Min Max Q25 Q50 Q75
## five 2.146334 1.006782 10 1.1489125 3.466987 1.217214 1.984834 3.075697
## single 3.142372 2.249814 10 0.5385165 5.792236 1.118392 3.077428 5.221028
##
## $comparison
## NULL
##
## $groups
## biomass.sqrt groups
## single 3.142372 a
## five 2.146334 b
##
## attr(,"class")
## [1] "group"
# step4:
# 当不同处理存在交互作用时,需要通过简单效应分析对交互作用进行检验;
# 为了保证方差分析总体的误差项一致,一般不建议通过事后检验进行交互作用的检验;
# 需要通过phia包中的testInteractions功能固定其中一个处理水平,
# 对另一个处理水平进行方差分析
# 简单效应结果表明(1)在竞争情况下,初始个体数量对平均生物量的影响不显著;
# (2)在无竞争情况下,初始个体数量对平均生物量的影响十分显著。
library(phia)
testInteractions(fit.lm, fixed = "competition", across = "no.individuals")
## F Test:
## P-value adjustment method: holm
## Value Df Sum of Sq F Pr(>F)
## with 0.16737 1 0.0700 0.5934 0.4523
## without -2.15945 1 11.6581 98.7801 5.983e-08 ***
## Residuals 16 1.8883
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# 使用boxplot函数对数据进行盒状图的构建
boxplot(biomass.sqrt ~ no.individuals * competition, # 因变量与自变量之间的关系 y ~ x1 + x2 + x1:x2
data = competition.data, # 数据来源
ylim = c(0, 8), # y轴范围
xlab = "Treatment", # x轴标题
ylab = "Biomass (g, squared)", # y轴标题
col = c("lightblue", "pink"),
names = rep("", 4) # 修改x轴的刻度的名字
)
# 调整刻度与刻度文字之间的距离
# https://blog.csdn.net/weixin_40628687/article/details/79254791
axis(side = 1, at = 1:4,
labels = c("With comp.\n /5 indv.",
"With comp.\n /1 indv.",
"Without comp.\n /5 indv.",
"Without comp.\n /1 indv."), padj = 0.5)
# 在盒状图中添加简单效应分析的结果
text(x = c(1.5, 3.5), # 字符的x轴位置
y = c(3, 7), # 字符的y轴位置
labels = c("n.s.", "***") # 输出本文字符
)