# Demonstration script: polynomial regression with nls(), overfitting,
# decision trees (rpart / tree), and k-fold cross-validation (bootstrap).
# NOTE(review): reads CSVs from a fixed local directory via setwd() —
# run from a machine where D:/R/Sample exists.

# Polynomial fitting with nls ----
library(nls2)

# Cubic fit to noisy data generated from the true relation y = x.
x <- 1:20
y <- x + rnorm(20)
plot(x, y)
xy <- data.frame(x = x, y = y)
res <- nls(y ~ a + b * x + c * x^2 + d * x^3, data = xy,
           start = list(a = 1, b = 1, c = 0.5, d = 0.1))
curve((x), col = 4, add = TRUE)   # true relation y = x (blue)
lines(x, predict(res), col = 2)   # fitted cubic (red)

# Degree-5 fit with larger noise (sd = 3).
x <- 1:20
y <- x + rnorm(20, sd = 3)
plot(x, y)
xy <- data.frame(x = x, y = y)
res <- nls(y ~ a + b * x + c * x^2 + d * x^3 + e * x^4 + f * x^5, data = xy,
           start = list(a = 1, b = 1, c = 0.5, d = 0.1, e = 0.05, f = 0.001))
curve((x), col = 4, add = TRUE)
lines(x, predict(res), col = 2)

# Polynomial approximation: overfitting demonstration ----
set.seed(1234)
x <- 1:20
y <- x + rnorm(20, sd = 3)
plot(x, y)
xy <- data.frame(x = x, y = y)
res5 <- nls(y ~ a + b * x + c * x^2 + d * x^3 + e * x^4 + f * x^5, data = xy,
            start = list(a = 1, b = 1, c = 0.5, d = 0.1, e = 0.05, f = 0.001))
curve((x), col = 4, add = TRUE)   # blue: true relation
lines(x, predict(res5), col = 2)  # red: degree-5 fit
xy <- data.frame(x = x, y = y)
res1 <- nls(y ~ a + b * x, data = xy, start = list(a = 1, b = 1))
lines(x, predict(res1), col = 3)  # green: linear fit

# Training error (against the noisy observations y)
mean((y - predict(res5))^2)
mean((y - predict(res1))^2)
# Generalization error (against the noise-free truth, which is x itself)
mean((x - predict(res5))^2)
mean((x - predict(res1))^2)

# NOTE(review): the x^5 term is missing below (the formula jumps from
# e*x^4 to f*x^6) — likely a typo in the original; kept as-is to
# preserve the original behavior.
res10 <- nls(y ~ a + b * x + c * x^2 + d * x^3 + e * x^4 + f * x^6 +
               g * x^7 + h * x^8 + i * x^9 + j * x^10,
             data = xy,
             start = list(a = 1, b = 1, c = 0.5, d = 0.1, e = 0.05,
                          f = 0.001, g = 0.001, h = 0.001, i = 0.001,
                          j = 0.0001),
             control = nls.control(warnOnly = TRUE))
lines(x, predict(res10), col = 4)

## Decision tree: PlayTennis ----
library(rpart)
setwd("D:/R/Sample")
playTennis <- read.csv("07PlayTennis02.csv", header = TRUE)
(playTennis.tr <- rpart(Play ~ ., playTennis,
                        control = rpart.control(minsplit = 1)))
plot(playTennis.tr)
text(playTennis.tr, cex = 1.2)

## Decision tree: iris ----
# NOTE(review): this masks the built-in `iris` dataset with the CSV version;
# the CSV is expected to have a `class` column.
library(rpart)
setwd("D:/R/Sample")
iris <- read.csv("07iris.csv", header = TRUE)
(iris.tr <- rpart(class ~ ., iris, control = rpart.control(minsplit = 1)))
plot(iris.tr)
text(iris.tr)

# Same tree with a larger minsplit (less overfitting).
library(rpart)
setwd("D:/R/Sample")
iris <- read.csv("07iris.csv", header = TRUE)
(iris.tr <- rpart(class ~ ., iris, control = rpart.control(minsplit = 30)))
plot(iris.tr)
text(iris.tr)

# Numerical example of overfitting ----
library(rpart)
n.train <- 200
n.test <- 200
sd <- 1.5  # NOTE(review): shadows stats::sd() in this script's scope
train1 <- data.frame(x = rnorm(n.train, mean = 1, sd = sd),
                     y = rnorm(n.train, mean = 1, sd = sd),
                     c = rep(as.factor(1), n.train))
train2 <- data.frame(x = rnorm(n.train, mean = -1, sd = sd),
                     y = rnorm(n.train, mean = -1, sd = sd),
                     c = rep(as.factor(2), n.train))
train <- rbind(train1, train2)
test1 <- data.frame(x = rnorm(n.test, mean = 1, sd = sd),
                    y = rnorm(n.test, mean = 1, sd = sd),
                    c = rep(as.factor(1), n.test))
test2 <- data.frame(x = rnorm(n.test, mean = -1, sd = sd),
                    y = rnorm(n.test, mean = -1, sd = sd),
                    c = rep(as.factor(2), n.test))
test <- rbind(test1, test2)
plot(subset(train, c == 1)[, 1:2], xlim = c(-5, 5), ylim = c(-5, 5))
points(subset(train, c == 2)[, 1:2], col = "red", xlab = "", ylab = "")

# Sweep the complexity parameter cp and record train/test error rates.
res <- c()
for (cp in c(0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 0.1, 0.07, 0.05, 0.03,
             0.01, 0.007, 0.005, 0.003, 0.001)) {
  t <- rpart(c ~ ., train, control = rpart.control(minsplit = 3, cp = cp))
  tbl.train <- table(train$c, predict(t, train, type = "class"))
  err.train <- 1 - sum(diag(tbl.train)) / sum(tbl.train)
  tbl.test <- table(test$c, predict(t, test, type = "class"))
  err.test <- 1 - sum(diag(tbl.test)) / sum(tbl.test)
  print(c(cp, err.train, err.test))
  res <- rbind(res, c(cp, err.train, err.test))
}
plot(res[, 1:2], log = "x", type = "l",
     xlim = c(max(res[, 1]), min(res[, 1])), ylim = c(0, 1.0),
     xlab = "cp in rpart", ylab = "err.train & err.test")
points(res[, c(1, 3)], type = "l", col = "red", xlab = "", ylab = "")

# Random (unseparable) data: a classifier can be built, but it is
# meaningless — both classes are uniform on the unit square (rpart).
library(rpart)
set.seed(2012)
n.train <- 100
n.test <- 100
train1.x <- runif(n.train)
train1.y <- runif(n.train)
train1.c <- rep(as.factor(1), n.train)
train1 <- data.frame(x = train1.x, y = train1.y, c = train1.c)
train2.x <- runif(n.train)
train2.y <- runif(n.train)
train2.c <- rep(as.factor(2), n.train)
train2 <- data.frame(x = train2.x, y = train2.y, c = train2.c)
train <- rbind(train1, train2)
plot(subset(train, c == 1)[, 1:2], xlim = c(0, 1), ylim = c(0, 1))
points(subset(train, c == 2)[, 1:2], col = "red", xlab = "", ylab = "")
t <- rpart(c ~ ., train)
plot(t)
text(t)
test1 <- data.frame(x = runif(n.test), y = runif(n.test),
                    c = rep(as.factor(1), n.test))
test2 <- data.frame(x = runif(n.test), y = runif(n.test),
                    c = rep(as.factor(2), n.test))
test <- rbind(test1, test2)

res <- c()
for (cp in c(0.03, 0.025, 0.02, 0.015, 0.01, 0.008, 0.005, 0.003)) {
  t <- rpart(c ~ ., train, control = rpart.control(minsplit = 3, cp = cp))
  tbl.train <- table(train$c, predict(t, train, type = "class"))
  err.train <- 1 - sum(diag(tbl.train)) / sum(tbl.train)
  tbl.test <- table(test$c, predict(t, test, type = "class"))
  err.test <- 1 - sum(diag(tbl.test)) / sum(tbl.test)
  print(c(cp, err.train, err.test))
  res <- rbind(res, c(cp, err.train, err.test))
}
plot(res[, 1:2], type = "l",
     xlim = c(max(res[, 1]), min(res[, 1])), ylim = c(0, 1.0),
     xlab = "cp in rpart", ylab = "err.test & err.train")
points(res[, c(1, 3)], type = "l", col = "red", xlab = "", ylab = "")

# Same data with the tree package.
# FIX: library(tree) was never loaded in the original script, so
# tree() / tree.control() / partition.tree() failed at runtime.
library(tree)
md <- 0.0083
t <- tree(c ~ ., train, control = tree.control(n.train * 2, mindev = md))
plot(t)
text(t)
(tbl.train <- table(train$c, predict(t, train, type = "class")))
sum(diag(tbl.train)) / sum(tbl.train)
(tbl.test <- table(test$c, predict(t, test, type = "class")))
sum(diag(tbl.test)) / sum(tbl.test)
# Plot the training points labelled by class, with the tree's partition.
t.label <- c("1", "2")[train[, 3]]
t.color <- c("blue", "red")[train[, 3]]
plot(train[, 1:2], type = "n")
text(train[, 1:2], labels = t.label, col = t.color, cex = 0.7)
partition.tree(t, add = TRUE, col = 1, cex = 2)

# Digit-recognition data ----
library(rpart)
setwd("D:/R/Sample")
dig <- read.csv("05optdigits.tra.csv", header = FALSE,
                colClasses = c(rep("integer", 64), "factor"))
dig.test <- read.csv("05optdigits.tes.csv", header = FALSE,
                     colClasses = c(rep("integer", 64), "factor"))

# Digit recognition with rpart: cp sweep, train vs. test error.
for (cp in c(0.07, 0.05, 0.03, 0.01, 0.003, 0.001, 0.0003, 0.0001, 0.00003)) {
  dig.tr <- rpart(V65 ~ ., dig, control = rpart.control(minsplit = 3, cp = cp))
  tbl <- table(dig[, 65], predict(dig.tr, dig, type = "class"))
  err.train <- 1 - sum(diag(tbl)) / sum(tbl)
  tbl <- table(dig.test[, 65], predict(dig.tr, dig.test, type = "class"))
  err.test <- 1 - sum(diag(tbl)) / sum(tbl)
  print(c(cp, err.train, err.test))
}

#
## Cross-validation (cv) of R decision trees: rpart on 07PlayTennis02.csv
#
# A naive call can fail because a cv fold may contain a factor level
# absent from the training folds, e.g.:
#> results <- crossval(playTennis[,-5],playTennis[,5], theta.fit, theta.predict, ngroup=3)
# Error in model.frame.default(Terms, newdata, na.action = na.action, xlev = attr(object, :
#   factor 'Temperature' has new level(s) Hot
library(rpart)
setwd("D:/R/Sample")
playTennis <- read.csv("07PlayTennis02.csv", header = TRUE)
library(bootstrap)  # provides crossval()
# Fitting function: rebuild a data frame with named columns, fit rpart.
theta.fit <- function(x, y) {
  tmp <- data.frame(Outlook = x[, 1], Temperature = x[, 2],
                    Humidity = x[, 3], Windy = x[, 4], class = y)
  return(rpart(class ~ ., tmp, control = rpart.control(minsplit = 30)))
}
theta.predict <- function(fit, x) {
  predict(fit, data.frame(x), type = "class")
}
results <- crossval(playTennis[, -5], playTennis[, 5],
                    theta.fit, theta.predict, ngroup = 3)
(cm <- table(playTennis[, 5], results$cv.fit))
(accuracy10CV <- sum(diag(cm)) / sum(cm))

#
# Same data, 7-fold cv with a minimally-constrained tree (minsplit = 2).
#
library(rpart)
setwd("D:/R/Sample")
playTennis <- read.csv("07PlayTennis02.csv", header = TRUE)
xy <- playTennis
library(bootstrap)  # provides crossval()
theta.fit <- function(x, y) {
  tmp <- data.frame(Outlook = x[, 1], Temperature = x[, 2],
                    Humidity = x[, 3], Windy = x[, 4], class = y)
  return(rpart(class ~ ., tmp, control = rpart.control(minsplit = 2)))
}
theta.predict <- function(fit, x) {
  predict(fit, data.frame(x), type = "class")
}
results <- crossval(xy[, -5], xy[, 5], theta.fit, theta.predict, ngroup = 7)
cm <- table(xy[, 5], results$cv.fit)
(accuracy10CV <- sum(diag(cm)) / sum(cm))

#
## Ten-fold cross-validation for rpart on iris
#
library(bootstrap)  # provides crossval()
theta.fit <- function(x, y) {
  tmp <- data.frame(sepallength = x[, 1], sepalwidth = x[, 2],
                    petallength = x[, 3], petalwidth = x[, 4], class = y)
  return(rpart(class ~ ., tmp, control = rpart.control(minsplit = 30)))
}
theta.predict <- function(fit, x) {
  predict(fit, data.frame(x), type = "class")
}
results <- crossval(iris[, -5], iris[, 5],
                    theta.fit, theta.predict, ngroup = 10)
(cm <- table(iris[, 5], results$cv.fit))
(accuracy10CV <- sum(diag(cm)) / sum(cm))

#
## Training error when all the training data are used
#
m <- rpart(class ~ ., iris, control = rpart.control(minsplit = 30))
predicted <- predict(m, iris, type = "class")
correct <- iris[, 5]
(cm <- table(correct, predicted))
(accuracyTraining <- sum(diag(cm)) / sum(cm))