Here MSE refers to \(\Vert\hat{\beta}-\beta^\star\Vert^2\); since the dimension grows, we put \(\Vert\hat{\beta}-\beta^\star\Vert^2/(p+1)\) on the vertical axis when plotting. Below is the case \(n\geq p\), simulated \(r\) times.
library(ggplot2)
set.seed(1234)
r <- 50   # number of simulation replicates
n <- 200  # sample size
d <- 20   # number of dimension steps
errors <- numeric(d + 1)
ns <- floor(seq(n - 1 - d, n - 1, length = d + 1))  # p from n-1-d up to n-1
for (j in 1:r) {
  i <- 1
  for (p in ns) {
    X <- matrix(rnorm(n * p), n, p)
    beta <- rnorm(p + 1)
    y <- cbind(rep(1, n), X) %*% beta + rnorm(n)
    colnames(X) <- paste0("X", 1:p)
    data <- data.frame(y = y, X)
    model_formula <- as.formula(paste("y ~", paste0("X", 1:p, collapse = "+")))
    M <- lm(model_formula, data = data)
    err <- sum((M$coefficients - beta)^2) / (p + 1)
    errors[i] <- errors[i] + err
    i <- i + 1
  }
}
errors <- errors / r
p_values <- ns
plot_df <- data.frame(p = p_values + 1, error = errors)
ggplot(plot_df, aes(x = p, y = error)) +
  geom_line() +
  geom_point() +
  xlab("dimension of beta") +
  ylab("average squared estimation error of beta") +
  ggtitle("Case p <= n") +
  theme(plot.title = element_text(hjust = 0.5))
plot_df1 <- data.frame(p = p_values[1:d] + 1, error = errors[1:d])
ggplot(plot_df1, aes(x = p, y = error)) +
  geom_line() +
  geom_point() +
  xlab("dimension of beta") +
  ylab("average squared estimation error of beta") +
  ggtitle("Case p <= n, with the point p = n removed") +
  theme(plot.title = element_text(hjust = 0.5))
Conclusion: with design matrix \(X\in\mathbb{R}^{n\times p}\), when \(p<n\) the matrix \(X\) generically has full column rank, so the expectation of the estimation error \(\Vert\hat{\beta}-\beta^\star\Vert^2\) exists and is a fixed constant depending on \(p\). When \(p=n\), the estimation error appears not to converge; proving this requires integrating \(\frac{\sigma^2}{p}\mathrm{trace}[(X^TX)^{-1}]\) against the joint density of the sample.
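In fact there is a closed form to check against (this aside and the code below are additions, not part of the original notes; the values of n, p, and sims are illustrative): for \(X\) with i.i.d. \(N(0,1)\) entries, \(X^TX\) is Wishart-distributed, and \(\mathbb{E}\,\mathrm{trace}[(X^TX)^{-1}]=p/(n-p-1)\) for \(n>p+1\), which diverges as \(p\to n\). A quick Monte Carlo check:

set.seed(1)
n <- 200
p <- 150
sims <- 200
traces <- replicate(sims, {
  X <- matrix(rnorm(n * p), n, p)
  sum(diag(solve(crossprod(X))))  # trace of (X^T X)^{-1}
})
mean(traces)     # Monte Carlo estimate
p / (n - p - 1)  # theoretical value, 150/49 ~ 3.06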
Next we use the Moore-Penrose pseudoinverse to examine the behavior when \(n<p\):
library(ggplot2)
library(MASS)
set.seed(1234)
r <- 5    # number of simulation replicates
n <- 200  # sample size
d <- 30   # number of dimension steps
errors <- numeric(d + 1)
ns <- floor(seq(n - 1, n - 1 + d * n, n))  # p = n-1, 2n-1, ..., in steps of n
for (j in 1:r) {
  i <- 1
  for (p in ns) {
    X <- matrix(rnorm(n * p), n, p)
    X <- cbind(rep(1, n), X)
    beta <- rnorm(p + 1)
    y <- X %*% beta + rnorm(n)
    beta_pred <- ginv(X) %*% y  # minimum-norm solution via the Moore-Penrose pseudoinverse
    err <- sum((beta_pred - beta)^2) / (p + 1)
    errors[i] <- errors[i] + err
    i <- i + 1
  }
}
errors <- errors / r
p_values <- ns[-1]  # drop the first point, where p + 1 = n
plot_df <- data.frame(p = p_values + 1, error = errors[-1])
ggplot(plot_df, aes(x = p, y = error)) +
  geom_line() +
  geom_point() +
  xlab("dimension of beta") +
  ylab("average squared estimation error of beta") +
  ggtitle("Case p > n") +
  theme(plot.title = element_text(hjust = 0.5))
This is not surprising. As the dimension tends to infinity, the information available for each individual \(\beta_i\) becomes very small, while the minimum-norm constraint of the least-squares solution keeps the average MSE slightly below 1.
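One way to see this (a sketch added here, not in the original notes): the minimum-norm solution \(\hat{\beta}=X^{+}y\) lies in the row space of \(X\), whose dimension is at most \(n\). Writing \(P\) for the orthogonal projection onto that row space and using \(\beta^\star\sim N(0,I_{p+1})\),
\[
\frac{\mathbb{E}\Vert\hat{\beta}-\beta^\star\Vert^2}{p+1}
\;\geq\;
\frac{\mathbb{E}\Vert(I-P)\beta^\star\Vert^2}{p+1}
\;=\;
\frac{p+1-n}{p+1}
\;\longrightarrow\;1
\quad(p\to\infty),
\]
so the unrecoverable component of \(\beta^\star\), which carries unit energy per coordinate, dominates the average MSE.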
The full sample of a problem is \(\mathbb{D}=\{X,Y\}\), whereas in practice we only obtain \(\mathbb{S}\subset\mathbb{D}\). The set of all algorithms is \(\{f_1,f_2,\dots,f_\infty\}\), and the set of algorithms under a model assumption is \(\mathcal{F}=\{f_1,f_2,\dots,f_n\}\) (where \(n\) may also be infinite). By minimizing the empirical error over \(\mathcal{F}\) on the finite sample \(\mathbb{S}\), we can find the best algorithm \(\hat{f}_{\mathcal{F},\mathbb{S}}\) (abbreviated \(\hat{f}_{\mathcal{F}}\)). Similarly, on the full sample \(\mathbb{D}\) we obtain \(f_{\mathcal{F},\mathbb{D}}\) (abbreviated \(f^\star_{\mathcal{F}}\)). Extending further to the unrestricted set of algorithms gives the overall best algorithm \(f^\star\).
For \(\hat{f}_{\mathcal{F}}\), selected on the finite sample, the true risk is the expectation of the loss function over the full sample \(\mathbb{D}\). Comparing the true risks of \(\hat{f}_{\mathcal{F}}\), \(f^\star_{\mathcal{F}}\), and \(f^\star\) gives three levels at which to understand the error, as decomposed below.
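Presumably the intended three-level comparison is the standard decomposition (stated here as a sketch, with \(R(\cdot)\) denoting the true risk):
\[
R(\hat{f}_{\mathcal{F}}) - R(f^\star)
= \underbrace{R(\hat{f}_{\mathcal{F}}) - R(f^\star_{\mathcal{F}})}_{\text{estimation error}}
\;+\;
\underbrace{R(f^\star_{\mathcal{F}}) - R(f^\star)}_{\text{approximation error}},
\]
where the first term reflects having only the finite sample \(\mathbb{S}\) and the second reflects restricting to the model class \(\mathcal{F}\).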
Fix the sample size \(n=200\). For each particular \(p\), randomly generate a standard normal vector \(\beta^\star\in\mathbb{R}^{p+1}\), then generate \(y=X\beta^\star+\epsilon,\ \epsilon\sim N(0,I_n)\). The parameter \(K\) is the number of cross-validation folds, and \(r\) is the number of repetitions of the cross-validation. Each cross-validation run first shuffles the indices into \(K\) folds; each fold in turn serves as the set for computing the generalization error, with the others used for training. In the plots below, MSE always refers to the mean squared prediction error.
library(ggplot2)
library(MASS)
set.seed(1234)
r <- 10   # number of repeated cross-validations
K <- 5    # number of folds
n <- 200  # sample size
d <- 150
train.errors <- numeric(d + 1)
gen.errors <- numeric(d + 1)
ns <- 1:(d + 1)
i <- 1
for (p in ns) {
  X <- matrix(rnorm(n * p), n, p)
  X <- cbind(rep(1, n), X)
  beta <- rnorm(p + 1)
  y <- X %*% beta + rnorm(n)
  train.errors.r <- rep(0, r)
  temp.errors.r <- rep(0, r)
  for (j in 1:r) {
    indices <- sample(n)
    # fold boundaries; seq up to n + 1 so that all n observations are used
    break.points <- floor(seq(1, n + 1, length = K + 1))
    temp.errors.k <- rep(0, K)
    train.errors.k <- rep(0, K)
    for (k in 1:K) {
      chunk <- indices[break.points[k]:(break.points[k + 1] - 1)]
      X.train <- X[-chunk, ]
      y.train <- y[-chunk]
      X.test <- X[chunk, ]
      y.test <- y[chunk]
      beta.train <- ginv(X.train) %*% y.train  # Moore-Penrose pseudoinverse
      train.errors.k[k] <- sum((y.train - X.train %*% beta.train)^2) / length(y.train)
      temp.errors.k[k] <- sum((y.test - X.test %*% beta.train)^2) / length(y.test)
    }
    temp.errors.r[j] <- mean(temp.errors.k)
    train.errors.r[j] <- mean(train.errors.k)
  }
  gen.errors[i] <- mean(temp.errors.r)
  train.errors[i] <- mean(train.errors.r)
  i <- i + 1
}
p_values <- ns[-1]
plot_df <- data.frame(p = p_values + 1, cv.error = gen.errors[-1], train.error = train.errors[-1])
ggplot(plot_df, aes(x = p)) +
  geom_line(aes(y = cv.error, color = "CV MSE")) +
  geom_point(aes(y = cv.error, color = "CV MSE")) +
  geom_line(aes(y = train.error, color = "Train MSE")) +
  geom_point(aes(y = train.error, color = "Train MSE")) +
  geom_hline(yintercept = 1, linetype = "solid", color = "black", linewidth = 1.5) +
  xlab("dimension of beta") +
  ylab("cross-validation MSE") +
  ggtitle("Case p < n(1 - 1/K) (K = 5, n = 200, r = 10)") +
  theme(plot.title = element_text(hjust = 0.5)) +
  scale_color_manual(values = c("CV MSE" = "red", "Train MSE" = "blue")) +
  labs(color = "Error Types")
The neighborhood of \(p=160\) corresponds to a training-set size of \(n(1-1/K)=160\): when the number of columns \(p+1\) reaches the number of training rows, the pseudoinverse sits at the interpolation threshold and the MSE becomes very large. Next we plot more points, for \(p=1,3,\dots,399\), and restrict the \(y\)-axis to \((0,100)\).
library(ggplot2)
library(MASS)
set.seed(1234)
r <- 5
K <- 5
n <- 200
d <- 200
gen.errors <- numeric(d)
train.errors <- numeric(d)
ns <- seq(1, 2 * d, 2)  # p = 1, 3, ..., 399
i <- 1
for (p in ns) {
  X <- matrix(rnorm(n * p), n, p)
  X <- cbind(rep(1, n), X)
  beta <- rnorm(p + 1)
  y <- X %*% beta + rnorm(n)
  train.errors.r <- rep(0, r)
  temp.errors.r <- rep(0, r)
  for (j in 1:r) {
    indices <- sample(n)
    # fold boundaries; seq up to n + 1 so that all n observations are used
    break.points <- floor(seq(1, n + 1, length = K + 1))
    temp.errors.k <- rep(0, K)
    train.errors.k <- rep(0, K)
    for (k in 1:K) {
      chunk <- indices[break.points[k]:(break.points[k + 1] - 1)]
      X.train <- X[-chunk, ]
      y.train <- y[-chunk]
      X.test <- X[chunk, ]
      y.test <- y[chunk]
      beta.train <- ginv(X.train) %*% y.train  # Moore-Penrose pseudoinverse
      train.errors.k[k] <- sum((y.train - X.train %*% beta.train)^2) / length(y.train)
      temp.errors.k[k] <- sum((y.test - X.test %*% beta.train)^2) / length(y.test)
    }
    temp.errors.r[j] <- mean(temp.errors.k)
    train.errors.r[j] <- mean(train.errors.k)
  }
  gen.errors[i] <- mean(temp.errors.r)
  train.errors[i] <- mean(train.errors.r)
  i <- i + 1
}
p_values <- ns[-1]
plot_df <- data.frame(p = p_values + 1, cv.error = gen.errors[-1], train.error = train.errors[-1])
ggplot(plot_df, aes(x = p)) +
  geom_line(aes(y = cv.error, color = "CV MSE")) +
  geom_point(aes(y = cv.error, color = "CV MSE")) +
  geom_line(aes(y = train.error, color = "Train MSE")) +
  geom_point(aes(y = train.error, color = "Train MSE")) +
  xlab("dimension of beta") +
  ylab("cross-validation MSE") +
  ggtitle("Low-dimensional case (K = 5, n = 200, r = 5)") +
  theme(plot.title = element_text(hjust = 0.5)) +
  scale_color_manual(values = c("CV MSE" = "red", "Train MSE" = "blue")) +
  labs(color = "Error Types") +
  coord_cartesian(ylim = c(0, 100)) +  # zoom without dropping points, unlike ylim()
  geom_vline(xintercept = 160, linetype = "solid", color = "black", linewidth = 1)
Below is the case \(p>200\), with \(p\) increasing in steps of 200.
library(ggplot2)
library(MASS)
set.seed(1234)
r <- 5
K <- 5
n <- 200
d <- 30
gen.errors <- numeric(d + 1)
train.errors <- numeric(d + 1)
ns <- floor(seq(n - 1, n - 1 + d * n, n))  # p = n-1, 2n-1, ..., in steps of n
i <- 1
for (p in ns) {
  X <- matrix(rnorm(n * p), n, p)
  X <- cbind(rep(1, n), X)
  beta <- rnorm(p + 1)
  y <- X %*% beta + rnorm(n)
  train.errors.r <- rep(0, r)
  temp.errors.r <- rep(0, r)
  for (j in 1:r) {
    indices <- sample(n)
    # fold boundaries; seq up to n + 1 so that all n observations are used
    break.points <- floor(seq(1, n + 1, length = K + 1))
    temp.errors.k <- rep(0, K)
    train.errors.k <- rep(0, K)
    for (k in 1:K) {
      chunk <- indices[break.points[k]:(break.points[k + 1] - 1)]
      X.train <- X[-chunk, ]
      y.train <- y[-chunk]
      X.test <- X[chunk, ]
      y.test <- y[chunk]
      beta.train <- ginv(X.train) %*% y.train  # Moore-Penrose pseudoinverse
      train.errors.k[k] <- sum((y.train - X.train %*% beta.train)^2) / length(y.train)
      temp.errors.k[k] <- sum((y.test - X.test %*% beta.train)^2) / length(y.test)
    }
    temp.errors.r[j] <- mean(temp.errors.k)
    train.errors.r[j] <- mean(train.errors.k)
  }
  gen.errors[i] <- mean(temp.errors.r)
  train.errors[i] <- mean(train.errors.r)
  i <- i + 1
}
p_values <- ns[-1]
plot_df <- data.frame(p = p_values + 1, cv.error = gen.errors[-1], train.error = train.errors[-1])
ggplot(plot_df, aes(x = p)) +
  geom_line(aes(y = cv.error, color = "CV MSE")) +
  geom_point(aes(y = cv.error, color = "CV MSE")) +
  geom_line(aes(y = train.error, color = "Train MSE")) +
  geom_point(aes(y = train.error, color = "Train MSE")) +
  xlab("dimension of beta") +
  ylab("cross-validation MSE") +
  ggtitle("Case p > n (K = 5, n = 200, r = 5)") +
  theme(plot.title = element_text(hjust = 0.5)) +
  scale_color_manual(values = c("CV MSE" = "red", "Train MSE" = "blue")) +
  labs(color = "Error Types")
The MSE grows approximately linearly when the dimension is high.
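A heuristic for the linear growth (a sketch added here, under the assumptions of this simulation): for a fresh test point \(x_0=(1,z)\) with \(z\sim N(0,I_p)\), \(\mathbb{E}\big[(x_0^T(\hat{\beta}-\beta^\star))^2\mid\hat{\beta}\big]=\Vert\hat{\beta}-\beta^\star\Vert^2\), and by the row-space argument above \(\mathbb{E}\Vert\hat{\beta}-\beta^\star\Vert^2\approx p+1-n_{\text{train}}\), which is linear in \(p\).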
According to the ETH Zürich lecture notes, there are two bootstrap-based ways to estimate the generalization error. The first draws \(n\) observations with replacement, computes the error of the refitted model directly on the training data, and averages by Monte Carlo; a minimal sketch of it is given below. The second is the out-of-bootstrap sample for estimation of the generalization error: resample \(B\) times, fit each time on the \(n\) drawn (possibly repeated) observations, and compute the test MSE on the observations that were never drawn. The simulations that follow use the second variant.
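A minimal sketch of the first variant (an illustration added here, not code from the original notes; the dimension p = 50 is arbitrary): fit on each bootstrap resample, evaluate the squared error on the original sample, and average over B resamples.

library(MASS)
set.seed(1234)
n <- 200
p <- 50
B <- 20
X <- cbind(rep(1, n), matrix(rnorm(n * p), n, p))
beta <- rnorm(p + 1)
y <- X %*% beta + rnorm(n)
boot.errors <- numeric(B)
for (b in 1:B) {
  idx <- sample(1:n, n, replace = TRUE)
  beta.b <- ginv(X[idx, ]) %*% y[idx]           # fit on the bootstrap resample
  boot.errors[b] <- mean((y - X %*% beta.b)^2)  # error on the original sample
}
mean(boot.errors)  # Monte Carlo estimate of the generalization error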
Low-dimensional case:
library(ggplot2)
library(MASS)
set.seed(1234)
B <- 10   # number of bootstrap resamples
n <- 200
d <- 400
train.errors <- numeric(d + 1)
gen.errors <- numeric(d + 1)
ns <- 1:(d + 1)
i <- 1
for (p in ns) {
  X <- matrix(rnorm(n * p), n, p)
  X <- cbind(rep(1, n), X)
  beta <- rnorm(p + 1)
  y <- X %*% beta + rnorm(n)
  train.errors.B <- rep(0, B)
  temp.errors.B <- rep(0, B)
  for (j in 1:B) {
    indices <- sample(1:n, n, replace = TRUE)
    left.indices <- setdiff(1:n, indices)  # out-of-bootstrap observations
    X.b <- X[indices, ]
    y.b <- y[indices]
    X.left <- X[left.indices, ]
    y.left <- y[left.indices]
    beta.train <- ginv(X.b) %*% y.b  # Moore-Penrose pseudoinverse
    train.errors.B[j] <- sum((y.b - X.b %*% beta.train)^2) / length(y.b)
    temp.errors.B[j] <- sum((y.left - X.left %*% beta.train)^2) / length(y.left)
  }
  gen.errors[i] <- mean(temp.errors.B)
  train.errors[i] <- mean(train.errors.B)
  i <- i + 1
}
p_values <- ns[-1]
plot_df <- data.frame(p = p_values + 1, bs.error = gen.errors[-1], train.error = train.errors[-1])
ggplot(plot_df, aes(x = p)) +
  geom_line(aes(y = bs.error, color = "Bootstrap MSE")) +
  geom_point(aes(y = bs.error, color = "Bootstrap MSE")) +
  geom_line(aes(y = train.error, color = "Train MSE")) +
  geom_point(aes(y = train.error, color = "Train MSE")) +
  geom_hline(yintercept = 1, linetype = "solid", color = "black", linewidth = 1.5) +
  xlab("dimension of beta") +
  ylab("out-of-bootstrap MSE") +
  ggtitle("Low-dimensional case (n = 200, B = 10), y restricted below 100") +
  theme(plot.title = element_text(hjust = 0.5)) +
  scale_color_manual(values = c("Bootstrap MSE" = "red", "Train MSE" = "blue")) +
  labs(color = "Error Types") +
  coord_cartesian(ylim = c(0, 100))  # zoom without dropping points, unlike ylim()
The same results plotted again without the \(y\)-axis restriction, reusing plot_df from the block above rather than recomputing it:
ggplot(plot_df, aes(x = p)) +
  geom_line(aes(y = bs.error, color = "Bootstrap MSE")) +
  geom_point(aes(y = bs.error, color = "Bootstrap MSE")) +
  geom_line(aes(y = train.error, color = "Train MSE")) +
  geom_point(aes(y = train.error, color = "Train MSE")) +
  geom_hline(yintercept = 1, linetype = "solid", color = "black", linewidth = 1.5) +
  xlab("dimension of beta") +
  ylab("out-of-bootstrap MSE") +
  ggtitle("Low-dimensional case (n = 200, B = 10)") +
  theme(plot.title = element_text(hjust = 0.5)) +
  scale_color_manual(values = c("Bootstrap MSE" = "red", "Train MSE" = "blue")) +
  labs(color = "Error Types")
We find one point, at \(p+1=126\), where the estimated generalization error fails to converge, and \(n\cdot(1-1/e)\approx126\); this is probably not a coincidence: a bootstrap sample of size \(n\) contains on average \(n(1-(1-1/n)^n)\approx n(1-1/e)\) distinct observations, so the effective training size again matches the number of parameters at this point. Near this value the estimated generalization error also rises and then falls locally, after which the growth is roughly linear.
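A quick numerical check of the expected number of distinct observations (a sketch added here, not in the original notes):

n <- 200
n * (1 - (1 - 1/n)^n)  # expected number of distinct observations, ~126.6
n * (1 - exp(-1))      # limiting approximation n(1 - 1/e), ~126.4

Next we examine the high-dimensional case: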
library(ggplot2)
library(MASS)
set.seed(1234)
B <- 20   # number of bootstrap resamples
n <- 200
d <- 30
train.errors <- numeric(d + 1)
gen.errors <- numeric(d + 1)
ns <- floor(seq(n - 1, n - 1 + d * n, n))  # p = n-1, 2n-1, ..., in steps of n
i <- 1
for (p in ns) {
  X <- matrix(rnorm(n * p), n, p)
  X <- cbind(rep(1, n), X)
  beta <- rnorm(p + 1)
  y <- X %*% beta + rnorm(n)
  train.errors.B <- rep(0, B)
  temp.errors.B <- rep(0, B)
  for (j in 1:B) {
    indices <- sample(1:n, n, replace = TRUE)
    left.indices <- setdiff(1:n, indices)  # out-of-bootstrap observations
    X.b <- X[indices, ]
    y.b <- y[indices]
    X.left <- X[left.indices, ]
    y.left <- y[left.indices]
    beta.train <- ginv(X.b) %*% y.b  # Moore-Penrose pseudoinverse
    train.errors.B[j] <- sum((y.b - X.b %*% beta.train)^2) / length(y.b)
    temp.errors.B[j] <- sum((y.left - X.left %*% beta.train)^2) / length(y.left)
  }
  gen.errors[i] <- mean(temp.errors.B)
  train.errors[i] <- mean(train.errors.B)
  i <- i + 1
}
p_values <- ns[-1]
plot_df <- data.frame(p = p_values + 1, bs.error = gen.errors[-1], train.error = train.errors[-1])
ggplot(plot_df, aes(x = p)) +
  geom_line(aes(y = bs.error, color = "Bootstrap MSE")) +
  geom_point(aes(y = bs.error, color = "Bootstrap MSE")) +
  geom_line(aes(y = train.error, color = "Train MSE")) +
  geom_point(aes(y = train.error, color = "Train MSE")) +
  xlab("dimension of beta") +
  ylab("out-of-bootstrap MSE") +
  ggtitle("High-dimensional case (n = 200, B = 20)") +
  theme(plot.title = element_text(hjust = 0.5)) +
  scale_color_manual(values = c("Bootstrap MSE" = "red", "Train MSE" = "blue")) +
  labs(color = "Error Types")
Taken together, apart from the anomaly near \(p+1=n\cdot(1-1/e)\approx126\), cross-validation and the bootstrap give broadly similar estimates of the generalization error.