Model Selection 模型选择
Goal: select a model using different statistical criteria.
Package involved: {leaps}
Part 1: Common criteria
1. Cp, BIC, AIC, R^2, adjusted R^2, PRESS
a) When you need to examine all possible subsets
library(leaps)
regsubsets(x=, nbest=, nvmax=, method=c("") ) .........................fit every combination of predictors
ss<-summary(result) ....................................................'result' is the object returned by regsubsets()
names(ss) ..............................................................................see what the summary object contains
plot(x, labels=x$xnames, scale=c("bic", "Cp", "adjr2", "r2"))
The summary object contains Cp, BIC, and adjusted R^2.
Example:
require(leaps)
result<-regsubsets(x=X1, y=y, nbest=1000, nvmax=12, method=c("exhaustive"), really.big=T)
xx<-summary(result)
### best subset according to Cp and to BIC
xx$which[which.min(xx$cp),]
xx$which[which.min(xx$bic),]
b) When you only need the best subsets
leaps(x=, y=, wt=rep(1, NROW(x)), int=TRUE, method=c("Cp", "adjr2", "r2"), nbest=10, names=NULL, df=NROW(x), strictly.compatible=TRUE)
..................this method uses an exhaustive search
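A small worked example of leaps() may help (simulated data; the variable layout is illustrative, and {leaps} must be installed). With nbest = 1 it keeps the single best subset of each size, ranked here by Mallows' Cp:

```r
library(leaps)

set.seed(4)
X <- matrix(rnorm(100 * 4), 100, 4)          # four candidate predictors
y <- 1 + 2 * X[, 1] - X[, 2] + rnorm(100)    # only the first two matter

out <- leaps(x = X, y = y, method = "Cp", nbest = 1)
out$which[which.min(out$Cp), ]               # variables in the Cp-best subset
```

The rows of out$which are logical indicators of which columns of x enter each candidate model, so indexing by which.min(out$Cp) recovers the best subset directly.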
2. Extra sum of squares test (partial F-test)
anova(model)
The order of the variables matters.
Fstat = [ (SSE_reduced - SSE_full) / (df_reduced - df_full) ] / MSE of the full model
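Calling anova() on two nested fits carries out exactly this partial F-test. A minimal sketch with simulated data (names illustrative): the reduced model drops x3, and the hand computation reproduces the F statistic that anova() reports.

```r
set.seed(1)
n  <- 50
x1 <- rnorm(n); x2 <- rnorm(n); x3 <- rnorm(n)
y  <- 1 + 2 * x1 - x2 + rnorm(n)             # x3 is pure noise

reduced <- lm(y ~ x1 + x2)
full    <- lm(y ~ x1 + x2 + x3)

a <- anova(reduced, full)                    # extra sum of squares F-test
print(a)

# Same statistic by hand: extra SS over extra df, divided by MSE of full model
Fstat <- ((deviance(reduced) - deviance(full)) / 1) /
         (deviance(full) / full$df.residual)
```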
3. Regularization using the lasso, ridge, or SCAD
Useful when p >> n.
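A sketch of the lasso/ridge case with {glmnet} (assumed installed; SCAD would need a different package such as {ncvreg}). Here alpha = 1 gives the lasso and alpha = 0 gives ridge; cv.glmnet() picks the penalty by cross-validation:

```r
library(glmnet)

set.seed(5)
n <- 50; p <- 100                            # p >> n scenario
X <- matrix(rnorm(n * p), n, p)
y <- X[, 1] - 2 * X[, 2] + rnorm(n)          # sparse true model

cvfit <- cv.glmnet(X, y, alpha = 1)          # lasso, lambda chosen by CV
b <- coef(cvfit, s = "lambda.min")           # sparse coefficient vector
sum(b != 0)                                  # number of selected terms
```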
Part 2: Common algorithms
1) forward, backward, stepwise.
2) exhaustive (all-subsets) search
Reuse the code from Part 1 a); just change the method argument.
install.packages('leaps')
library(leaps)
leaps<-regsubsets(y~x1+x2+x3+x4+x5,data=dat,nbest=10) # nbest = number of best models kept for each subset size
# view results
summary(leaps)
# plot a table of models showing variables in each model.
# models are ordered by the selection statistic.
plot(leaps,scale="r2")
plot(leaps,scale="Cp")
plot(leaps,scale="adjr2")
plot(leaps,scale="bic")
The following use other functions and packages (step(), add1(), drop1() are in base R's {stats}; bestglm() is in {bestglm}):
step(null, scope=list(upper=full, lower=null), direction='both', trace=TRUE)
step(null, scope=list(upper=full, lower=null), direction='forward', trace=TRUE)
step(full, scope=list(upper=full, lower=null), direction='backward', trace=TRUE)
add1(null, ~x2+x3+x4+x5)
drop1(full)
add1(null, ~x2+x3+x4+x5, test='F')
drop1(full, test='F')
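In the calls above, null and full are fitted models that bracket the search. A minimal sketch with simulated data (names illustrative) shows the whole workflow:

```r
set.seed(2)
dat <- data.frame(matrix(rnorm(100 * 5), 100, 5))
names(dat) <- paste0("x", 1:5)
dat$y <- 1 + 2 * dat$x1 - dat$x2 + rnorm(100)         # only x1, x2 matter

null <- lm(y ~ 1, data = dat)                         # intercept-only model
full <- lm(y ~ x1 + x2 + x3 + x4 + x5, data = dat)    # all candidates

sel <- step(null, scope = list(upper = full, lower = null),
            direction = "both", trace = FALSE)
formula(sel)                                          # selected model
```

With a signal this strong, stepwise AIC should retain x1 and x2 and drop the noise variables.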
fit = bestglm(Xy,IC='LOOCV')
fit$Subsets
fit = bestglm(Xy,IC='AIC')
fit$Subsets
fit = bestglm(Xy,IC='BIC')
fit$Subsets
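bestglm() comes from the {bestglm} package (assumed installed). Note the required input format: Xy is a single data frame with the predictors first and the response as the LAST column. A small sketch with simulated data:

```r
library(bestglm)

set.seed(6)
Xy <- data.frame(matrix(rnorm(100 * 3), 100, 3))
names(Xy) <- paste0("x", 1:3)
Xy$y <- 1 + 2 * Xy$x1 + rnorm(100)           # response must come last

fit <- bestglm(Xy, IC = "BIC")
fit$Subsets                                  # criterion value for each size
fit$BestModel                                # the selected lm fit
```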
Hand-written algorithm:
bestsubset <- function(X, y){
  P = ncol(X)
  subsets = expand.grid(rep(list(0:1), P))     # all 2^P inclusion indicators
  names(subsets) = paste('X', 1:P, sep='')
  stat = NULL
  SST = sum((y - mean(y))^2)
  fitall = lm(y ~ X)
  n = nrow(X)
  MSEP = deviance(fitall)/(n - P - 1)          # MSE of the full model
  for(i in 1:nrow(subsets)){
    subs = which(subsets[i,] > 0)
    if(length(subs) == 0) fit = lm(y ~ 1)
    else {
      subX = X[, subs, drop=FALSE]
      fit = lm(y ~ subX)
    }
    p = length(subs) + 1                       # parameters incl. intercept
    SSE = deviance(fit)
    R2 = 1 - SSE/SST
    R2a = 1 - SSE/SST*(n-1)/(n-p)
    Cp = SSE/MSEP - (n - 2*p)
    AIC = n*log(SSE) - n*log(n) + 2*p
    BIC = n*log(SSE) - n*log(n) + log(n)*p
    X1 = as.matrix(cbind(1, X[, subs]))
    hatMat = X1 %*% solve(t(X1) %*% X1) %*% t(X1)
    eList = fit$residuals
    dList = eList/(1 - diag(hatMat))           # deleted (PRESS) residuals
    PRESS = sum(dList^2)
    # unlist() so the row binds as a numeric vector, not a list
    criList = c(p, unlist(subsets[i,]), SSE, R2, R2a, Cp, AIC, BIC, PRESS)
    stat = rbind(stat, criList)
  }
  rownames(stat) = NULL
  colnames(stat) = c('p', names(subsets), 'SSE', 'R2', 'R2a', 'Cp', 'AIC', 'BIC', 'PRESS')
  model = NULL
  model$R2 = which.max(stat[, P+3])
  model$R2a = which.max(stat[, P+4])
  model$Cp = which.min(stat[, P+5])
  model$AIC = which.min(stat[, P+6])
  model$BIC = which.min(stat[, P+7])
  model$PRESS = which.min(stat[, P+8])
  list(model = model, stat = stat)
}
3) anova()
Note that linear models and GLMs (e.g. logistic regression) are handled differently here.
Part 3: Special computational implementations:
Cross Validation
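A minimal K-fold cross-validation sketch in base R (data and fold count illustrative): fit on K-1 folds, predict the held-out fold, and average the held-out MSE as the estimate of prediction error.

```r
set.seed(3)
n   <- 100
dat <- data.frame(x = rnorm(n))
dat$y <- 1 + 2 * dat$x + rnorm(n)

K      <- 5
folds  <- sample(rep(1:K, length.out = n))             # random fold labels
cv_mse <- sapply(1:K, function(k) {
  fit  <- lm(y ~ x, data = dat[folds != k, ])          # train on K-1 folds
  pred <- predict(fit, newdata = dat[folds == k, ])    # predict held-out fold
  mean((dat$y[folds == k] - pred)^2)
})
mean(cv_mse)                                           # CV estimate of MSE
```

The same idea underlies the LOOCV option in bestglm() and the lambda selection in cv.glmnet().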
Wednesday, April 2, 2014