导入成年人身高-体重数据:
> hw <- read.table(file="height-weight.txt",header=T,sep="\t",fileEncoding="UTF-8")
查看前5行:
> head(hw,5)
## sex weight height
## 1 1 77 1.82
## 2 0 58 1.61
## 3 0 53 1.61
## 4 1 68 1.77
## 5 0 59 1.57
> attach(hw)
> plot(log(height),log(weight),col=sex+1)
> fit.all=lm(log(weight)~log(height),data=hw)
> fit.all
##
## Call:
## lm(formula = log(weight) ~ log(height), data = hw)
##
## Coefficients:
## (Intercept) log(height)
## 2.599 2.930
> fit.female=lm(log(weight)~log(height),data=hw,subset=sex==0)
> fit.female
##
## Call:
## lm(formula = log(weight) ~ log(height), data = hw, subset = sex ==
## 0)
##
## Coefficients:
## (Intercept) log(height)
## 3.120 1.833
> fit.male=lm(log(weight)~log(height),data=hw,subset=sex==1)
> fit.male
##
## Call:
## lm(formula = log(weight) ~ log(height), data = hw, subset = sex ==
## 1)
##
## Coefficients:
## (Intercept) log(height)
## 2.982 2.318
> abline(fit.all,col=3)
> abline(fit.female,col=1)
> abline(fit.male,col=2)
> legend(0.4,4.8,c("all","female","male"),col=c(3,1,2),lty=c(1,1,1))
> summary(fit.female)
##
## Call:
## lm(formula = log(weight) ~ log(height), data = hw, subset = sex ==
## 0)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.28367 -0.05921 -0.01299 0.06503 0.31279
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.1203 0.1413 22.08 < 2e-16 ***
## log(height) 1.8333 0.2829 6.48 2.74e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.103 on 109 degrees of freedom
## Multiple R-squared: 0.2781, Adjusted R-squared: 0.2715
## F-statistic: 41.99 on 1 and 109 DF, p-value: 2.743e-09
> plot(log(height),log(weight),col=sex+1)
> fit=lm(log(weight)~log(height)+sex,data=hw)
> coef(fit)
## (Intercept) log(height) sex
## 3.0087053 2.0571556 0.1240784
> abline(3.0087, 2.0572,col=2,lty=2,lwd=2)
> abline(3.0087+ 0.1241, 2.0572,col=3,lty=2,lwd=2)
> names(fit)
## [1] "coefficients" "residuals" "effects" "rank"
## [5] "fitted.values" "assign" "qr" "df.residual"
## [9] "xlevels" "call" "terms" "model"
> summary(fit)
##
## Call:
## lm(formula = log(weight) ~ log(height) + sex, data = hw)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.28715 -0.06964 -0.01143 0.07636 0.43717
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.00871 0.11560 26.028 < 2e-16 ***
## log(height) 2.05716 0.23092 8.909 3.55e-16 ***
## sex 0.12408 0.02427 5.113 7.51e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1145 on 196 degrees of freedom
## Multiple R-squared: 0.6601, Adjusted R-squared: 0.6567
## F-statistic: 190.4 on 2 and 196 DF, p-value: < 2.2e-16
> #误差方差估计
> e=resid(fit)
> sum(e^2)/(199-3)
## [1] 0.01312098
> #复相关系数估计
> y.hat=fitted(fit)
> y=log(hw[,"weight"])
> R2_1=cor(y.hat,y)^2
> R2_2=var(y.hat)/var(y)
> #回归方程显著性检验
> R2=var(y.hat)/var(y)
> n=nrow(hw)
> p=ncol(hw)
> k=2 #检验两个参数
> F=(n-p)/k*R2/(1-R2)
> F
## [1] 190.3614
> #提取部分信息
> summary(fit)$fstatistic
## value numdf dendf
## 190.3614 2.0000 196.0000
> summary(fit)$r.squared
## [1] 0.6601487
> # H0:b=c=0
> model0=lm(log(weight)~1,data=hw)
> fit2=lm(log(weight)~log(height)+sex,data=hw)
> anova(model0,fit2)
## Analysis of Variance Table
##
## Model 1: log(weight) ~ 1
## Model 2: log(weight) ~ log(height) + sex
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 198 7.5672
## 2 196 2.5717 2 4.9955 190.36 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
> # H0: b=c
> model.full=lm(log(weight)~log(height)+sex,data=hw)
> z=log(hw[,"height"])+hw[,"sex"]
> model.null=lm(log(weight)~z,data=hw)
> anova(model.null,model.full)
## Analysis of Variance Table
##
## Model 1: log(weight) ~ z
## Model 2: log(weight) ~ log(height) + sex
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 197 3.3601
## 2 196 2.5717 1 0.78836 60.084 4.817e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
例1中给出的结果为: \[ \log(weight)=3.0087+2\log(height)+0.1241sex+\varepsilon, \varepsilon \sim N(0,0.0131) \] 可以写成: \[ log(\frac{weight}{height^2})=3.0087+0.1241sex+\varepsilon \] 女性为: \[ log(\frac{weight}{height^2})=3.0087+\varepsilon \] 男性为: \[ log(\frac{weight}{height^2})=3.1238+\varepsilon \]
> delta=qnorm(0.8,0,0.0131)
> #female
> low0=exp(3.0087-delta)
> high0=exp(3.0087+delta)
> #male
> low1=exp(3.1238-delta)
> high1=exp(3.1238+delta)
> print("Interval of normal:")
## [1] "Interval of normal:"
> cat("female:",c(low0,high0))
## female: 20.03889 20.48566
> cat("male:",c(low1,high1))
## male: 22.48334 22.98462
> install.packages("alr4") # 安装
> library(alr4)
## 载入需要的程辑包:car
## 载入需要的程辑包:carData
## 载入需要的程辑包:effects
## lattice theme set by effectsTheme()
## See ?effectsTheme for details.
> t.test(salary~sex,data=salary,var.equal=T)
##
## Two Sample t-test
##
## data: salary by sex
## t = 1.8474, df = 50, p-value = 0.0706
## alternative hypothesis: true difference in means between group Male and group Female is not equal to 0
## 95 percent confidence interval:
## -291.257 6970.550
## sample estimates:
## mean in group Male mean in group Female
## 24696.79 21357.14
> fit=lm(salary~sex,data=salary)
> summary(fit)
##
## Call:
## lm(formula = salary ~ sex, data = salary)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8602.8 -4296.6 -100.8 3513.1 16687.9
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 24697 938 26.330 <2e-16 ***
## sexFemale -3340 1808 -1.847 0.0706 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5782 on 50 degrees of freedom
## Multiple R-squared: 0.0639, Adjusted R-squared: 0.04518
## F-statistic: 3.413 on 1 and 50 DF, p-value: 0.0706
首先,以上两个检验得到的结果是相同的。
以上得到的结果中,sex的系数即为男女平均工资的差异,为3340元,男性平均工资较高。
p值为0.07<0.1,结果是显著的。该结论可以说明有歧视女性的现象,但可能存在干扰因素。
> #数据预处理
> dat=salary
> dat$rank=as.character(dat$rank)
> dat$rank[which(dat$rank=="Asst")]=1
> dat$rank[which(dat$rank=="Assoc")]=2
> dat$rank[which(dat$rank=="Prof")]=3
> dat$rank=as.numeric(dat$rank)
> summary(lm(salary~rank,data=dat))#rank对salary有显著影响
##
## Call:
## lm(formula = salary ~ rank, data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5071.5 -1738.3 -443.4 1337.2 8523.5
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 11663.1 1066.9 10.93 7.38e-15 ***
## rank 5952.8 482.8 12.33 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2973 on 50 degrees of freedom
## Multiple R-squared: 0.7525, Adjusted R-squared: 0.7476
## F-statistic: 152 on 1 and 50 DF, p-value: < 2.2e-16
> #用卡方检验检验 rank 和 sex 是否独立
> #chisq.test(salary[,"rank"],salary[,"sex"])#卡方近似算法有可能不准
> summary(lm(rank~sex,data=dat))#sex对rank有显著影响
##
## Call:
## lm(formula = rank ~ sex, data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.1579 -0.7143 -0.1579 0.8421 1.2857
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.1579 0.1375 15.696 <2e-16 ***
## sexFemale -0.4436 0.2650 -1.674 0.1
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8475 on 50 degrees of freedom
## Multiple R-squared: 0.05309, Adjusted R-squared: 0.03415
## F-statistic: 2.803 on 1 and 50 DF, p-value: 0.1003
p<2e-16,工资和职称强相关。
p=0.1,若取水平略大于0.1,拒绝rank和sex不相关的原假设。
验证:不论 Rank 属于哪个类,Salary 与 Sex 的关系保持不变。
> lm(salary~sex,data=salary,subset=(rank=="Asst"))
##
## Call:
## lm(formula = salary ~ sex, data = salary, subset = (rank == "Asst"))
##
## Coefficients:
## (Intercept) sexFemale
## 17919.6 -339.6
> lm(salary~sex,data=salary,subset=(rank=="Assoc"))
##
## Call:
## lm(formula = salary ~ sex, data = salary, subset = (rank == "Assoc"))
##
## Coefficients:
## (Intercept) sexFemale
## 23444 -1874
> lm(salary~sex,data=salary,subset=(rank=="Prof"))
##
## Call:
## lm(formula = salary ~ sex, data = salary, subset = (rank == "Prof"))
##
## Coefficients:
## (Intercept) sexFemale
## 29872 -1067
比较三个回归系数值,发现不同 Rank 中,性别的系数差异仍然较大,所以我们不认为上述事实成立。我们依旧认为性别对工资有显著影响,这是排除了职位之后的论断。
> model.full2=lm(salary~sex+rank+year+degree+ysdeg,data=salary)
> model.null2=lm(salary~year+degree+ysdeg,data=salary)
> anova(model.null2,model.full2)
## Analysis of Variance Table
##
## Model 1: salary ~ year + degree + ysdeg
## Model 2: salary ~ sex + rank + year + degree + ysdeg
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 48 672102002
## 2 45 258858365 3 413243637 23.946 2.053e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
p值很小,检验结果表示原假设应被拒绝,即 Sex 和 Rank 有显著影响。
> n=1e5
> x=runif(n,-1,1)
> y=runif(n,-1,1)
> m=sum(x^2+y^2<=1)
> p=m/n
> S=4*p
> cat("单位圆面积:",S)
## 单位圆面积: 3.1352
> x=rnorm(n)
> m=mean(sqrt(abs(x)))
> var=var(sqrt(abs(x)))
> cat("mean=",m,"\n","var=",var)
## mean= 0.8231982
## var= 0.1225058
> hist(x)
> x=rnorm(n)
> y=sqrt(2*pi)*exp(-x^4+x^2/2)
> mean(y)
## [1] 1.813786
> #初始想法
> n=1e5
> x=rnorm(n)
> sum(x[x>5^2])/n
## [1] 0
> #改进
> x=rnorm(n)+5
> h=x^2*exp(-x^2/2+(x-5)^2/2)
> sum(h[which(x>5)])/n
## [1] 7.764954e-06
> a=2;b=1;sigma=1
> n=100
> x=c(rep(1,n/2),rep(0,n/2))
> z=(1:n)/n
>
> N=1e4
> B.hat=B.tilde=NULL
> for (k in 1:N){
+ epsilon=rnorm(n)*sigma
+ y=a+b*x+epsilon #模拟数据生成
+ # 计算两个估计
+ x.bar=mean(x);z.bar=mean(z)
+ b.hat=sum((x-x.bar)*y)/sum((x-x.bar)^2)
+ b.tilde=sum((z-z.bar)*y)/sum((x-x.bar)*(z-z.bar))
+ B.hat=c(B.hat,b.hat)
+ B.tilde=c(B.tilde,b.tilde)
+ }#simulation
> cat("mean of B.hat:",mean(B.hat),"mean of B.tilde:",mean(B.tilde),"\n","var.true=",sigma^2/sum((x-x.bar)^2),"\nvar of B.hat:",var(B.hat),"\nvar of B.tilde: ",var(B.tilde))
## mean of B.hat: 0.9990877 mean of B.tilde: 0.9985881
## var.true= 0.04
## var of B.hat: 0.03827255
## var of B.tilde: 0.05116159
可以看出 \(\tilde{b}\) 依然是无偏估计,但是方差比LS估计大。
> df=read.table("http://staff.ustc.edu.cn/~ynyang/2022/lab/cp.txt",head=TRUE)
> plot(df$x,df$y)
> weights_df=c(rep(1,times=100),rep(1/4,times=100))
> lm( y~x ,data=df,weights=weights_df)#加权LS估计
##
## Call:
## lm(formula = y ~ x, data = df, weights = weights_df)
##
## Coefficients:
## (Intercept) x
## 3.995 1.931
转变点检测
算法思想:设断点为 \(m\), 损失函数定义为: \[ loss(m)=m\log(\sigma^2)+(n-m)\log(\tau^2)+\Sigma_{i=1}^m\frac{(y_i-x_i^T\beta)^2}{\sigma^2}+\Sigma_{i=m+1}^n\frac{(y_i-x_i^T\beta)^2}{\tau^2} \] 对每个 \(m\),都可以用IRLS迭代求得最优的回归系数和方差的估计,代入求得\(loss(m)\), 然后选取使loss最小的m作为断点的估计值。
> x=df$x
> y=df$y
> X=cbind(rep(1,times=200),x)
> n=length(df$x)
> epsilon=1e-10
>
> fit=lm( y~x ,data=df)#OLS估计
> beta_0=fit$coefficients
> resid=fit$residuals #残差
>
> m_best=2
> sigma2=sum(resid^2)/(n-1)
> loss_min=sum(resid^2)/sigma2+n*log(sigma2)
> for(m in 2:198){
+ k=0
+ m1=m+1#用于分割
+ while(k<100){#IRLS
+ sigma2=sum(resid[1:m]^2)/m#前一部分方差估计
+ tau2=sum(resid[m1:n]^2)/(n-m)#后一部分方差估计
+ G=diag(c(rep(sigma2,times=m),rep(tau2,times=n-m)))#方差矩阵估计
+
+ beta=solve(t(X)%*%solve(G)%*%X)%*%t(X)%*%solve(G)%*%y #系数估计
+
+
+ diff=sum((beta-beta_0)^2)
+ beta_0=beta
+ resid=y-X%*%beta#更新残差
+
+ if(diff<epsilon) {break} #终止条件
+ k=k+1
+ }#while
+ loss=sum(resid[1:m]^2)/sigma2+sum(resid[m1:n]^2)/tau2+m*log(sigma2)+(n-m)*log(tau2)
+ #if(is.nan(loss)) {print("error");break}
+ if(loss<loss_min){
+ loss_min=loss
+ m_best=m
+ }#if
+ }#for
> cat("a=",beta[1],"b=",beta[2],"\n","sigma2=",sigma2,"tau2",tau2,"\n","m=",m_best)
## a= 3.852823 b= 2.346893
## sigma2= 1.596252 tau2 0.08910093
## m= 100
> library(alr4)
> df=brains
> fit=lm(log(BrainWt)~log(BodyWt),data=df)
> print(fit$coefficients)#最小二乘估计
## (Intercept) log(BodyWt)
## 2.1347883 0.7516861
> x=log(df$BodyWt)
> err=fit$residuals
> plot(x,err)
从图中可以看出,残差基本均匀分布在0上下,无非线性趋势。但是方差齐性假设不太合理,不同x对应的方差可能不同。
> y=df$BrainWt
> y=log(y)
> y[14]=100
> fit_err=lm(y~x)
> print(fit_err$coefficients)
## (Intercept) x
## 5.3004745 -0.3855299
与前者结果差别很大。
最小一乘法:
> X=cbind(rep(1,times=length(x)),x)
> beta_0=fit_err$coefficients
> err=fit_err$residuals
> G=1/abs(err)
> G=diag(G)
>
> k=0
> while (k<1e2) {#最小一乘法迭代
+ beta=solve(t(X)%*%G%*%X)%*%t(X)%*%G%*%y
+ diff=sum((beta-beta_0)^2)
+ if(diff<1e-5) {cat("iteration:",k,"\n","beta=",beta);break}
+ err=abs(y-X%*%beta)
+ err[err<0.0001]=0.0001
+ err=c(err)
+ G=diag(1/err)
+ beta_0=beta
+
+ k=k+1
+ }
## iteration: 16
## beta= 2.094453 0.7447085
> plot(x,y)
> abline(fit_err)
> abline(a=beta[1],b=beta[2])
结论:错误记录对于最小二乘影响很大,对最小一乘影响较小!
因此我们验证了最小一乘方法比最小二乘法稳健,它对异常的响应值不太敏感。