1 多重线性回归模型与F检验

导入成年人身高-体重数据:

> hw <- read.table(file="height-weight.txt",header=T,sep="\t",fileEncoding="UTF-8")

查看前5行:

> head(hw,5)
##   sex weight height
## 1   1     77   1.82
## 2   0     58   1.61
## 3   0     53   1.61
## 4   1     68   1.77
## 5   0     59   1.57

1.1 简单回归

> attach(hw)
> plot(log(height),log(weight),col=sex+1)
> fit.all=lm(log(weight)~log(height),data=hw)
> fit.all
## 
## Call:
## lm(formula = log(weight) ~ log(height), data = hw)
## 
## Coefficients:
## (Intercept)  log(height)  
##       2.599        2.930
> fit.female=lm(log(weight)~log(height),data=hw,subset=sex==0)
> fit.female
## 
## Call:
## lm(formula = log(weight) ~ log(height), data = hw, subset = sex == 
##     0)
## 
## Coefficients:
## (Intercept)  log(height)  
##       3.120        1.833
> fit.male=lm(log(weight)~log(height),data=hw,subset=sex==1)
> fit.male
## 
## Call:
## lm(formula = log(weight) ~ log(height), data = hw, subset = sex == 
##     1)
## 
## Coefficients:
## (Intercept)  log(height)  
##       2.982        2.318
> abline(fit.all,col=3)
> abline(fit.female,col=1)
> abline(fit.male,col=2)
> legend(0.4,4.8,c("all","female","male"),col=c(3,1,2),lty=c(1,1,1))

> summary(fit.female)
## 
## Call:
## lm(formula = log(weight) ~ log(height), data = hw, subset = sex == 
##     0)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.28367 -0.05921 -0.01299  0.06503  0.31279 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   3.1203     0.1413   22.08  < 2e-16 ***
## log(height)   1.8333     0.2829    6.48 2.74e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.103 on 109 degrees of freedom
## Multiple R-squared:  0.2781, Adjusted R-squared:  0.2715 
## F-statistic: 41.99 on 1 and 109 DF,  p-value: 2.743e-09

1.2 多重回归

> plot(log(height),log(weight),col=sex+1)
> fit=lm(log(weight)~log(height)+sex,data=hw)
> coef(fit)
## (Intercept) log(height)         sex 
##   3.0087053   2.0571556   0.1240784
> abline(3.0087, 2.0572,col=2,lty=2,lwd=2)
> abline(3.0087+ 0.1241, 2.0572,col=3,lty=2,lwd=2)

> names(fit)
##  [1] "coefficients"  "residuals"     "effects"       "rank"         
##  [5] "fitted.values" "assign"        "qr"            "df.residual"  
##  [9] "xlevels"       "call"          "terms"         "model"

1.3 summary 函数

> summary(fit)
## 
## Call:
## lm(formula = log(weight) ~ log(height) + sex, data = hw)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.28715 -0.06964 -0.01143  0.07636  0.43717 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.00871    0.11560  26.028  < 2e-16 ***
## log(height)  2.05716    0.23092   8.909 3.55e-16 ***
## sex          0.12408    0.02427   5.113 7.51e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1145 on 196 degrees of freedom
## Multiple R-squared:  0.6601, Adjusted R-squared:  0.6567 
## F-statistic: 190.4 on 2 and 196 DF,  p-value: < 2.2e-16
> #误差方差估计
> e=resid(fit)
> sum(e^2)/(199-3)
## [1] 0.01312098
> #复相关系数估计
> y.hat=fitted(fit)
> y=log(hw[,"weight"])
> R2_1=cor(y.hat,y)^2
> R2_2=var(y.hat)/var(y)
> #回归方程显著性检验
> R2=var(y.hat)/var(y)
> n=nrow(hw)
> p=ncol(hw) 
> k=2 #检验两个参数
> F=(n-p)/k*R2/(1-R2)
> F
## [1] 190.3614
> #提取部分信息
> summary(fit)$fstatistic
##    value    numdf    dendf 
## 190.3614   2.0000 196.0000
> summary(fit)$r.squared
## [1] 0.6601487

1.4 F 检验

> # H0:b=c=0
> model0=lm(log(weight)~1,data=hw)
> fit2=lm(log(weight)~log(height)+sex,data=hw)
> anova(model0,fit2)
## Analysis of Variance Table
## 
## Model 1: log(weight) ~ 1
## Model 2: log(weight) ~ log(height) + sex
##   Res.Df    RSS Df Sum of Sq      F    Pr(>F)    
## 1    198 7.5672                                  
## 2    196 2.5717  2    4.9955 190.36 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
> # H0: b=c
> model.full=lm(log(weight)~log(height)+sex,data=hw)
> z=log(hw[,"height"])+hw[,"sex"]
> model.null=lm(log(weight)~z,data=hw)
> anova(model.null,model.full)
## Analysis of Variance Table
## 
## Model 1: log(weight) ~ z
## Model 2: log(weight) ~ log(height) + sex
##   Res.Df    RSS Df Sum of Sq      F    Pr(>F)    
## 1    197 3.3601                                  
## 2    196 2.5717  1   0.78836 60.084 4.817e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

1.5 练习题

1.5.1 练习1

例1中给出的结果为: \[ \log(weight)=3.0087+2\log(height)+0.1241sex+\varepsilon, \varepsilon \sim N(0,0.0131) \] 可以写成: \[ log(\frac{weight}{height^2})=3.0087+0.1241sex+\varepsilon \] 女性为: \[ log(\frac{weight}{height^2})=3.0087+\varepsilon \] 男性为: \[ log(\frac{weight}{height^2})=3.1238+\varepsilon \]

> delta=qnorm(0.8,0,0.0131)
> #female
> low0=exp(3.0087-delta)
> high0=exp(3.0087+delta)
> #male
> low1=exp(3.1238-delta)
> high1=exp(3.1238+delta)
> print("Interval of normal:")
## [1] "Interval of normal:"
> cat("female:",c(low0,high0))
## female: 20.03889 20.48566
> cat("male:",c(low1,high1))
## male: 22.48334 22.98462

1.5.2 练习2

> install.packages("alr4") # 安装
> library(alr4)
## 载入需要的程辑包:car
## 载入需要的程辑包:carData
## 载入需要的程辑包:effects
## lattice theme set by effectsTheme()
## See ?effectsTheme for details.

1.5.2.1 检验男女老师工资是否相同

> t.test(salary~sex,data=salary,var.equal=T)
## 
##  Two Sample t-test
## 
## data:  salary by sex
## t = 1.8474, df = 50, p-value = 0.0706
## alternative hypothesis: true difference in means between group Male and group Female is not equal to 0
## 95 percent confidence interval:
##  -291.257 6970.550
## sample estimates:
##   mean in group Male mean in group Female 
##             24696.79             21357.14
> fit=lm(salary~sex,data=salary)
> summary(fit)
## 
## Call:
## lm(formula = salary ~ sex, data = salary)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8602.8 -4296.6  -100.8  3513.1 16687.9 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    24697        938  26.330   <2e-16 ***
## sexFemale      -3340       1808  -1.847   0.0706 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5782 on 50 degrees of freedom
## Multiple R-squared:  0.0639, Adjusted R-squared:  0.04518 
## F-statistic: 3.413 on 1 and 50 DF,  p-value: 0.0706

首先,以上两个检验得到的结果是相同的。

以上得到的结果中,sex的系数即为男女平均工资的差异,为3340元,男性平均工资较高。

p值为0.07<0.1,结果是显著的。该结论可以说明有歧视女性的现象,但可能存在干扰因素。

1.5.2.2 职称与工资以及性别相关的证据

> #数据预处理
> dat=salary
> dat$rank=as.character(dat$rank)
> dat$rank[which(dat$rank=="Asst")]=1
> dat$rank[which(dat$rank=="Assoc")]=2
> dat$rank[which(dat$rank=="Prof")]=3
> dat$rank=as.numeric(dat$rank)
> summary(lm(salary~rank,data=dat))#rank对salary有显著影响
## 
## Call:
## lm(formula = salary ~ rank, data = dat)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5071.5 -1738.3  -443.4  1337.2  8523.5 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  11663.1     1066.9   10.93 7.38e-15 ***
## rank          5952.8      482.8   12.33  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2973 on 50 degrees of freedom
## Multiple R-squared:  0.7525, Adjusted R-squared:  0.7476 
## F-statistic:   152 on 1 and 50 DF,  p-value: < 2.2e-16
> #用卡方检验检验 rank 和 sex 是否独立
> #chisq.test(salary[,"rank"],salary[,"sex"])#卡方近似算法有可能不准
> summary(lm(rank~sex,data=dat))#sex对rank有显著影响
## 
## Call:
## lm(formula = rank ~ sex, data = dat)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.1579 -0.7143 -0.1579  0.8421  1.2857 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   2.1579     0.1375  15.696   <2e-16 ***
## sexFemale    -0.4436     0.2650  -1.674      0.1    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8475 on 50 degrees of freedom
## Multiple R-squared:  0.05309,    Adjusted R-squared:  0.03415 
## F-statistic: 2.803 on 1 and 50 DF,  p-value: 0.1003

p<2e-16,工资和职称强相关。

p=0.1,若取水平略大于0.1,拒绝rank和sex不相关的原假设。

1.5.2.3 排除干扰因素rank回归

验证:不论 Rank 属于哪个类,Salary 与 Sex 的关系保持不变。

> lm(salary~sex,data=salary,subset=(rank=="Asst"))
## 
## Call:
## lm(formula = salary ~ sex, data = salary, subset = (rank == "Asst"))
## 
## Coefficients:
## (Intercept)    sexFemale  
##     17919.6       -339.6
> lm(salary~sex,data=salary,subset=(rank=="Assoc"))
## 
## Call:
## lm(formula = salary ~ sex, data = salary, subset = (rank == "Assoc"))
## 
## Coefficients:
## (Intercept)    sexFemale  
##       23444        -1874
> lm(salary~sex,data=salary,subset=(rank=="Prof"))
## 
## Call:
## lm(formula = salary ~ sex, data = salary, subset = (rank == "Prof"))
## 
## Coefficients:
## (Intercept)    sexFemale  
##       29872        -1067

比较三个回归系数值,发现不同 Rank 中,性别的系数差异仍然较大,所以我们不认为上述事实成立。我们依旧认为性别对工资有显著影响,这是排除了职位之后的论断。

1.5.2.4 应用多重线性回归模型

> model.full2=lm(salary~sex+rank+year+degree+ysdeg,data=salary)
> model.null2=lm(salary~year+degree+ysdeg,data=salary)
> anova(model.null2,model.full2)
## Analysis of Variance Table
## 
## Model 1: salary ~ year + degree + ysdeg
## Model 2: salary ~ sex + rank + year + degree + ysdeg
##   Res.Df       RSS Df Sum of Sq      F    Pr(>F)    
## 1     48 672102002                                  
## 2     45 258858365  3 413243637 23.946 2.053e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

p值很小,检验结果表示原假设应被拒绝,即 Sex 和 Rank 有显著影响。

2 蒙特卡洛简介

2.1 单位圆面积

> n=1e5
> x=runif(n,-1,1)
> y=runif(n,-1,1)
> m=sum(x^2+y^2<=1)
> p=m/n
> S=4*p
> cat("单位圆面积:",S)
## 单位圆面积: 3.1352

2.2 近似正态

> x=rnorm(n)
> m=mean(sqrt(abs(x)))
> var=var(sqrt(abs(x)))
> cat("mean=",m,"\n","var=",var)
## mean= 0.8231982 
##  var= 0.1225058
> hist(x)

2.3 数值积分

> x=rnorm(n)
> y=sqrt(2*pi)*exp(-x^4+x^2/2)
> mean(y)
## [1] 1.813786

2.4 Importance sampling

> #初始想法
> n=1e5
> x=rnorm(n)
> sum(x[x>5^2])/n
## [1] 0
> #改进
> x=rnorm(n)+5
> h=x^2*exp(-x^2/2+(x-5)^2/2)
> sum(h[which(x>5)])/n
## [1] 7.764954e-06

2.5 模拟实验评估数据分析方法

> a=2;b=1;sigma=1
> n=100
> x=c(rep(1,n/2),rep(0,n/2))
> z=(1:n)/n
> 
> N=1e4
> B.hat=B.tilde=NULL
> for (k in 1:N){
+   epsilon=rnorm(n)*sigma
+   y=a+b*x+epsilon #模拟数据生成
+   # 计算两个估计
+   x.bar=mean(x);z.bar=mean(z)
+   b.hat=sum((x-x.bar)*y)/sum((x-x.bar)^2)
+   b.tilde=sum((z-z.bar)*y)/sum((x-x.bar)*(z-z.bar))
+   B.hat=c(B.hat,b.hat)
+   B.tilde=c(B.tilde,b.tilde)
+ }#simulation
> cat("mean of B.hat:",mean(B.hat),"mean of B.tilde:",mean(B.tilde),"\n","var.true=",sigma^2/sum((x-x.bar)^2),"\nvar of B.hat:",var(B.hat),"\nvar of B.tilde: ",var(B.tilde))
## mean of B.hat: 0.9990877 mean of B.tilde: 0.9985881 
##  var.true= 0.04 
## var of B.hat: 0.03827255 
## var of B.tilde:  0.05116159

可以看出 \(\tilde{b}\) 依然是无偏估计,但是方差比LS估计大。

3 广义最小二乘,迭代加权最小二乘

3.1 练习题

3.1.1 练习3

> df=read.table("http://staff.ustc.edu.cn/~ynyang/2022/lab/cp.txt",head=TRUE)
> plot(df$x,df$y)

> weights_df=c(rep(1,times=100),rep(1/4,times=100))
> lm( y~x ,data=df,weights=weights_df)#加权LS估计
## 
## Call:
## lm(formula = y ~ x, data = df, weights = weights_df)
## 
## Coefficients:
## (Intercept)            x  
##       3.995        1.931

3.1.2 练习4

转变点检测

算法思想:设断点为 \(m\), 损失函数定义为: \[ loss(m)=m\log(\sigma^2)+(n-m)\log(\tau^2)+\Sigma_{i=1}^m\frac{(y_i-x_i^T\beta)^2}{\sigma^2}+\Sigma_{i=m+1}^n\frac{(y_i-x_i^T\beta)^2}{\tau^2} \] 对每个 \(m\),都可以用IRLS迭代求得最优的回归系数和方差的估计,代入求得\(loss(m)\), 然后选取使loss最小的m作为断点的估计值。

> x=df$x
> y=df$y
> X=cbind(rep(1,times=200),x)
> n=length(df$x)
> epsilon=1e-10
> 
> fit=lm( y~x ,data=df)#OLS估计
> beta_0=fit$coefficients
> resid=fit$residuals #残差
> 
> m_best=2
> sigma2=sum(resid^2)/(n-1)
> loss_min=sum(resid^2)/sigma2+n*log(sigma2)
> for(m in 2:198){
+   k=0
+   m1=m+1#用于分割
+   while(k<100){#IRLS
+     sigma2=sum(resid[1:m]^2)/m#前一部分方差估计
+     tau2=sum(resid[m1:n]^2)/(n-m)#后一部分方差估计
+     G=diag(c(rep(sigma2,times=m),rep(tau2,times=n-m)))#方差矩阵估计
+     
+     beta=solve(t(X)%*%solve(G)%*%X)%*%t(X)%*%solve(G)%*%y #系数估计
+     
+     
+     diff=sum((beta-beta_0)^2)
+     beta_0=beta
+     resid=y-X%*%beta#更新残差
+   
+     if(diff<epsilon) {break} #终止条件
+     k=k+1
+   }#while
+   loss=sum(resid[1:m]^2)/sigma2+sum(resid[m1:n]^2)/tau2+m*log(sigma2)+(n-m)*log(tau2)
+   #if(is.nan(loss)) {print("error");break}
+   if(loss<loss_min){
+     loss_min=loss
+     m_best=m
+   }#if
+ }#for
> cat("a=",beta[1],"b=",beta[2],"\n","sigma2=",sigma2,"tau2",tau2,"\n","m=",m_best)
## a= 3.852823 b= 2.346893 
##  sigma2= 1.596252 tau2 0.08910093 
##  m= 100

3.1.3 练习5

> library(alr4)
> df=brains
> fit=lm(log(BrainWt)~log(BodyWt),data=df)
> print(fit$coefficients)#最小二乘估计
## (Intercept) log(BodyWt) 
##   2.1347883   0.7516861
> x=log(df$BodyWt)
> err=fit$residuals
> plot(x,err)

从图中可以看出,残差基本均匀分布在0上下,无非线性趋势。但是方差齐性假设不太合理,不同x对应的方差可能不同。

> y=df$BrainWt
> y=log(y)
> y[14]=100
> fit_err=lm(y~x)
> print(fit_err$coefficients)
## (Intercept)           x 
##   5.3004745  -0.3855299

与前者结果差别很大。

最小一乘法:

> X=cbind(rep(1,times=length(x)),x)
> beta_0=fit_err$coefficients
> err=fit_err$residuals
> G=1/abs(err)
> G=diag(G)
> 
> k=0
> while (k<1e2) {#最小一乘法迭代
+   beta=solve(t(X)%*%G%*%X)%*%t(X)%*%G%*%y
+   diff=sum((beta-beta_0)^2)
+   if(diff<1e-5) {cat("iteration:",k,"\n","beta=",beta);break}
+   err=abs(y-X%*%beta)
+   err[err<0.0001]=0.0001
+   err=c(err)
+   G=diag(1/err)
+   beta_0=beta
+   
+   k=k+1
+ }
## iteration: 16 
##  beta= 2.094453 0.7447085
> plot(x,y)
> abline(fit_err)
> abline(a=beta[1],b=beta[2])

结论:错误记录对于最小二乘影响很大,对最小一乘影响较小!

因此我们验证了最小一乘方法比最小二乘法稳健,它对异常的响应值不太敏感。