> install.packages(c("corrplot","alr4")) # 安装
library(corrplot)
## corrplot 0.92 loaded
> cor(state.x77)
## Population Income Illiteracy Life Exp Murder
## Population 1.00000000 0.2082276 0.10762237 -0.06805195 0.3436428
## Income 0.20822756 1.0000000 -0.43707519 0.34025534 -0.2300776
## Illiteracy 0.10762237 -0.4370752 1.00000000 -0.58847793 0.7029752
## Life Exp -0.06805195 0.3402553 -0.58847793 1.00000000 -0.7808458
## Murder 0.34364275 -0.2300776 0.70297520 -0.78084575 1.0000000
## HS Grad -0.09848975 0.6199323 -0.65718861 0.58221620 -0.4879710
## Frost -0.33215245 0.2262822 -0.67194697 0.26206801 -0.5388834
## Area 0.02254384 0.3633154 0.07726113 -0.10733194 0.2283902
## HS Grad Frost Area
## Population -0.09848975 -0.3321525 0.02254384
## Income 0.61993232 0.2262822 0.36331544
## Illiteracy -0.65718861 -0.6719470 0.07726113
## Life Exp 0.58221620 0.2620680 -0.10733194
## Murder -0.48797102 -0.5388834 0.22839021
## HS Grad 1.00000000 0.3667797 0.33354187
## Frost 0.36677970 1.0000000 0.05922910
## Area 0.33354187 0.0592291 1.00000000
> R=cor(state.x77) # Pearson 相关系数
> corrplot(R,diag=F)# 主对角线上不显示
> r=R["Murder","Frost"] # get the cor
> cor.test(state.x77[,"Murder"],state.x77[,"Frost"]) # or
##
## Pearson's product-moment correlation
##
## data: state.x77[, "Murder"] and state.x77[, "Frost"]
## t = -4.4321, df = 48, p-value = 5.405e-05
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.7106377 -0.3065115
## sample estimates:
## cor
## -0.5388834
> cor.test(~Murder+Frost,data=state.x77)
##
## Pearson's product-moment correlation
##
## data: Murder and Frost
## t = -4.4321, df = 48, p-value = 5.405e-05
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.7106377 -0.3065115
## sample estimates:
## cor
## -0.5388834
手工计算大样本检验
> r=R["Murder","Frost"]
> z=sqrt(50-2)*r
> pvalue=2*(1-pnorm(abs(z)))
> pvalue
## [1] 0.0001888417
> x=state.x77[,"Frost"]
> y=state.x77[,"Murder"]
> n=length(x)
> #r0=cor(x,y)
> t0=sum(x*y)-n*mean(x)*mean(y)
> #R_per=NULL
> t_per=NULL
> N=1000000
> for(i in 1:N){
+ x_per=sample(x) # 置换x
+ t_per[i]=sum(x_per*y)-n*mean(x)*mean(y)
+ # R_per[i]=cor(x_per,y)
+ }
> #p1=mean(abs(R_per)>=abs(r0))
> p2=mean(abs(t_per)>=abs(t0))
> print(p2)
## [1] 6.9e-05
随机产生数据,检查t-检验与置换检验的结果(p值)几乎相同
> n=20
> x=rnorm(n)
> y=rnorm(n)
> r0=cor(x,y)
> R_per=NULL
> N=100000
> for(i in 1:N){
+ x_per=sample(x)
+ R_per[i]=cor(x_per,y)
+ }
> pvalue.per=mean(abs(R_per)>=abs(r0))
> cor.test(x,y)->tmp
> pvalue.test=tmp$p.value
> pvalue.test
## [1] 0.3140216
> pvalue.per
## [1] 0.31705
Spearman’s rho 检验
> x=c(2,-2,-11,3,4)
> y=c(0,-1,-3,99,7)
> rankx=rank(x)
> ranky=rank(y)
> person=cor(x,y)
> spearman=cor(rankx,ranky)
> cor.test(x,y,method="spearman")
##
## Spearman's rank correlation rho
##
## data: x and y
## S = 2, p-value = 0.08333
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
## rho
## 0.9
上述例子中 Spearman 检验的精确 p 值为 0.08333,它是基于 spearman 系数的原假设下的精确 分布计算得到的,该精确 p 值可用置换检验方法逼近。试分别进行 1000,10000,100000 次置换,按照 公式 (1) 分别计算置换检验 p 值,观察这些 p 值是否接近 0.08333。
> S_per=NULL
> for( N in c(1000,10000,100000) ){
+ for(i in 1:N){
+ x_per=sample(x)
+ S_per[i]=cor(rank(x_per),ranky)
+ }
+ pvalue_per=mean(abs(S_per)>=spearman)
+ print(pvalue_per)
+ }
## [1] 0.093
## [1] 0.0826
## [1] 0.08315
可见随着置换次数增多,结果与精确p值逐渐接近。 # 3. 偏相关系数 ## 3.1 偏相关系数的计算及检验 ## 3.2 偏相关系数矩阵的计算 代码如下:
> Omega=solve(R)
> d=diag(Omega)
> D0.5=diag(1/sqrt(d))
> Rpartial=-D0.5%*%Omega%*%D0.5
> diag(Rpartial)=1
> Rpartial
## [,1] [,2] [,3] [,4] [,5] [,6]
## [1,] 1.00000000 0.32300027 -0.26148742 0.264128233 0.40899056 -0.24492979
## [2,] 0.32300027 1.00000000 -0.09587116 -0.013763485 -0.04286866 0.34074103
## [3,] -0.26148742 -0.09587116 1.00000000 0.014246098 0.24672522 -0.47770796
## [4,] 0.26412823 -0.01376349 0.01424610 1.000000000 -0.70591759 0.30797597
## [5,] 0.40899056 -0.04286866 0.24672522 -0.705917594 1.00000000 0.08682678
## [6,] -0.24492979 0.34074103 -0.47770796 0.307975970 0.08682678 1.00000000
## [7,] -0.22139493 0.01434030 -0.56010767 -0.270999997 -0.25970424 -0.17991875
## [8,] -0.05966557 0.24354899 0.33078309 -0.006829194 0.23544872 0.42343271
## [,7] [,8]
## [1,] -0.2213949 -0.059665571
## [2,] 0.0143403 0.243548989
## [3,] -0.5601077 0.330783093
## [4,] -0.2710000 -0.006829194
## [5,] -0.2597042 0.235448722
## [6,] -0.1799187 0.423432714
## [7,] 1.0000000 0.280149525
## [8,] 0.2801495 1.000000000
可以将其写成一个函数,输入协方差阵,输出偏相关系数矩阵,如下:
> r2rp=function(R){
+ Omega=solve(R)
+ d=diag(Omega)
+ D0.5=diag(1/sqrt(d))
+ Rp=-D0.5%*%Omega%*%D0.5
+ diag(Rp)=1
+ return(Rp)
+ }
绘出上述数据的高斯图模型:
> library(igraph)
## Warning: 程辑包'igraph'是用R版本4.2.1 来建造的
##
## 载入程辑包:'igraph'
## The following objects are masked from 'package:stats':
##
## decompose, spectrum
## The following object is masked from 'package:base':
##
## union
> A = abs(Rpartial)> 0.3
> g <- graph.adjacency( A, mode="undirected", diag=FALSE )
> plot.igraph(g, vertex.color="white", vertex.frame.color="black",
+ vertex.label.color="black", vertex.size=40, vertex.label.cex=1.2 )
R 数据集 ability.cov 给出了 112 个儿童的 6 项测试成绩的协方差矩阵,6 个科目分别是 general, picture, blocks , maze, reading, vocab (综、绘画、积木、迷宫、阅读、词汇量)。 1. 检验 picture 和 reading 是否相关 (给出 p 值即可)。
> dat=ability.cov$cov # 协方差矩阵
> D=sqrt(diag(1/diag(dat)))
> dat.cor=D%*%dat%*%D # 样本相关系数矩阵
> pvalue1=2*(1-pnorm(abs(sqrt(112)*dat.cor[2,5])))# 大样本检验 sqrt(n)*r 服从N(0,1)
> print(pvalue1)
## [1] 0.005393952
> Omega=r2rp(dat) #求出偏相关系数阵
> corrplot(Omega,diag=F)
> pvalue2=2*(1-pnorm(abs(sqrt(112)*Omega[2,5])))# 大样本检验 sqrt(n)*r 服从N(0,1)
> print(pvalue2)
## [1] 0.9376338
> library(igraph)
> Rp=Omega
> A=abs(Rp)>0.1
> g<-graph.adjacency(A,mode="undirected",diag=F)
> plot.igraph(g,vertex.color="white",vertex.frame.color="black",
+ vertex.label.color="black", vertex.size=40,vertex.label.cex=1.2)
> n=100000
> x=runif(n,-1,1);y=runif(n,-1,1)
> m=sum(x^2+y^2<=1)
> p=m/n #落在圆里的比例,即圆和正方形面积比
> S=4*p
> print(S)
## [1] 3.14532
> x=rnorm(n)
> mean(sqrt(abs(x)))
## [1] 0.8223936
> var(sqrt(abs(x)))
## [1] 0.1219618
> hist(x)
> hist(sqrt(abs(x)))
> n=30
> N=10000
> rho=0.5
> all.r=NULL
> all.atanhr=NULL
> for(k in 1:N){
+ u=rnorm(n)
+ v=rnorm(n)
+ x=u
+ y=rho*u+sqrt(1-rho^2)*v
+ r=cor(x,y)
+ all.r=c(all.r,r)
+ tmp=atanh(r)
+ all.atanhr=c(all.atanhr,tmp)
+ }
> par(mfrow=c(2,2))
> hist(all.r)
> hist(all.atanhr)
> qqnorm(all.r)
> qqnorm(all.atanhr)
> v=var(all.r)
> print(v)
## [1] 0.02027274
> (1-rho^2)^2/n
## [1] 0.01875
> var(all.atanhr)
## [1] 0.03687699
> 1/n
## [1] 0.03333333
可以看出后两个比较接近,渐进效果好
画散点图看渐进分布的方差关系
> n=100
> N=10000
> var1=NULL
> rho1=NULL
> for(rho in 0:9){
+ i=rho+1
+ rho=rho/10
+ all.r=NULL
+ for(k in 1:N){
+ u=rnorm(n)
+ v=rnorm(n)
+ x=u
+ y=rho*u+sqrt(1-rho^2)*v
+ r=cor(x,y)
+ all.r=c(all.r,r)
+ }
+ var1=c(var1,var(all.r))
+ rho1=c(rho1,rho)
+ }
> plot(rho1,var1)
这时难以看出函数关系。 线性化以后可以看出关系:\(v(\rho)=(1-\rho^2)^2/n\).
> plot(rho1^2,sqrt(n*var1))