library(tree) data(iris) (iris.tr<-tree(Species~.,data=iris)) plot(iris.tr,type="u"); text(iris.tr) ### (iris.tr1<-snip.tree(iris.tr,nodes=c(12,7))) plot(iris.tr1,type="u");text(iris.tr1) ### library(tree) iris.label<-c("S", "C", "V")[iris[, 5]] plot(iris[,3],iris[,4],type="n") text(iris[,3],iris[,4],labels=iris.label) partition.tree(iris.tr1,add=T,col=2,cex=1.5) ### library(tree) data(cars) cars.tr<-tree(dist~speed,data=cars) print(cars.tr) plot(cars.tr,type="u") text(cars.tr) plot(cars.tr,type="u") text(cars.tr) ### plot(cars$speed,cars$dist) partition.tree(cars.tr,add=T,col=2) ### (cars.tr1<-prune.tree(cars.tr,best=4)) plot(cars.tr1); text(cars.tr1,all=T) ### plot(cars$speed,cars$dist) partition.tree(cars.tr1,add=T,col=2) ### setwd("D:/R/Sample") playTennis <- read.csv("08PlayTennis.csv", header=T) (playTennis.tr<-tree(Play~.,data=playTennis)) plot(playTennis.tr); text(playTennis.tr) ### library(rpart) setwd("D:/R/Sample") playTennis <- read.csv("08PlayTennis.csv", header=T) (playTennis.tr <- rpart(Play~ ., playTennis ) ) plot(playTennis.tr); text(playTennis.tr) ### library(rpart) setwd("D:/R/Sample") playTennis <- read.csv("08PlayTennis.csv", header=T) (playTennis.tr <- rpart(Play~ ., playTennis, control=rpart.control(minsplit=1)) ) plot(playTennis.tr); text(playTennis.tr) playTennisTest02 <- read.csv("08PlayTennisTest02.csv",header=TRUE) predict(playTennis.tr, playTennisTest02) playTennisTest02 predict(playTennis.tr, playTennisTest02, type="prob") predict(playTennis.tr, PlayTennisTest02, type="matrix") #################################### ### cross validation library(rpart) setwd("D:/R/Sample") iris <- read.csv("07iris.csv", header=T) iris.tr <- rpart( class ~ . , data=iris, method="class" ) print( iris.tr ) # get the learning error pred <- predict( iris.tr, newdata=iris, type="class") cm <- table( iris$class, pred) print( cm ) err.iris.tr <- 1 - sum(diag(cm))/sum(cm) print( err.iris.tr ) # K-fold cross validation n <- nrow( iris ) # number of samples K <- 10 # number of folds size <- n %/% K # size of bins is "size" or "size"+1) set.seed(7) # this affects division of samples into c.v. bins rnk <- rank( runif( n ) ) # this gives a permutation of 1 to n block <- ( rnk - 1 ) %/% size + 1 # converts rnk to group numbers head( block ) # just for check all.err <- NULL all.pred <- NULL for ( k in 1:K ) { # learn a tree without the k-th group iris.tr <- rpart( class ~ . , iris[ block != k, ], method="class", control=rpart.control(minsplit=30) ) # predict with the k-th group pred <- predict( iris.tr, iris[ block==k, ], type="class" ) # confusion matrix cm <- table( iris$class[ block==k ], pred) # error rate = 1 - accuracy err <- 1 - sum(diag(cm))/sum(cm) # each error rate is combined all.err <- rbind( all.err, err ) # prediction of each fold is combined with the sample ids all.pred <- rbind( all.pred, data.frame( id=(1:n)[ block==k ], pred=pred) ) # use this if you want to inspect the current environment # browser() } print( as.numeric( all.err ) ) # to get the average, each error rate should be weighted. ave.err <- t( all.err ) %*% as.numeric( table(block) ) / n print( as.numeric( ave.err ) ) # all the predictions are sorted according to the sample id all.pred.sorted <- all.pred[ sort.int( all.pred$id, index.return=T )$ix, ] ( all.cm <- table( iris$class, all.pred.sorted$pred ) ) # 1 - sum(diag(all.cm))/sum(all.cm) is equal to # ave.err ### library(rpart) setwd("D:/R/Sample") playTennis <- read.csv("07PlayTennis02.csv", header=T) (playTennis.tr <- rpart(Play~ ., playTennis, control=rpart.control(minsplit=1)) ) plot(playTennis.tr); text(playTennis.tr) ### library(rpart) setwd("D:/R/Sample") playTennis <- read.csv("07PlayTennis02.csv", header=T) (playTennis.tr <- rpart(Play~ ., playTennis, control=rpart.control(minsplit=3)) ) plot(playTennis.tr); text(playTennis.tr) ### R で試す決定木のCV library(rpart) setwd("D:/R/Sample") xy <- read.csv("07PlayTennis02.csv", header=T) library(bootstrap) # crossval を使用する theta.fit <- function (x,y) { tmp <- data.frame( Outlook=x[,1], Temperature=x[,2], Humidity=x[,3], Windy=x[,4], class=y) return( rpart(class~., tmp, control=rpart.control(minsplit=1) ) ) } theta.predict <- function( fit, x) {predict( fit, data.frame(x), type="class" ) } results <- crossval(xy[,-5], xy[,5], theta.fit, theta.predict, ngroup=7) (cm <- table( xy[,5], results$cv.fit )) (accuracy10CV <- sum(diag(cm))/sum(cm)) ############### ### 過学習 # データ生成 library(rpart) n.train <- 200 n.test <- 200 sd <- 1.5 train1 <- data.frame( x=rnorm(n.train, mean=1,sd=sd), y=rnorm(n.train, mean=1,sd=sd), c=rep(as.factor(1),n.train) ) train2 <- data.frame( x=rnorm(n.train, mean=-1,sd=sd), y=rnorm(n.train, mean=-1,sd=sd), c=rep(as.factor(2),n.train) ) train <- rbind(train1, train2) test1 <- data.frame( x=rnorm(n.test, mean=1,sd=sd), y=rnorm(n.test, mean=1,sd=sd), c=rep(as.factor(1),n.test) ) test2 <- data.frame( x=rnorm(n.test, mean=-1,sd=sd), y=rnorm(n.test, mean=-1,sd=sd), c=rep(as.factor(2),n.test) ) test <- rbind(test1, test2) plot( subset(train, c==1)[,1:2], xlim=c(-5,5), ylim=c(-5,5) ) points( subset(train, c==2)[,1:2], col="red", xlab="", ylab="" ) ### 一様分布と正規分布の重なり set.seed(2015) n.train <- 100 n.test <- 100 train1.x <- runif(n.train) train1.y <- runif(n.train) train1.c <- rep(as.factor(1),n.train) train1 <- data.frame(x=train1.x, y=train1.y, c=train1.c) train2.x <- rnorm(n.train,0.5,0.1) train2.y <- rnorm(n.train,0.5,0.1) train2.c <- rep(as.factor(2),n.train) train2 <- data.frame(x=train2.x, y=train2.y, c=train2.c) train <- rbind(train1, train2) # plot( subset(train, c==1)[,1:2], xlim=c(0,1), ylim=c(0,1)) # points( subset(train, c==2)[,1:2], col="red", xlab="", ylab="") test1 <- data.frame(x=runif(n.test), y=runif(n.test), c=rep(as.factor(1),n.test)) test2 <- data.frame(x=rnorm(n.test,0.5,0.1), y=rnorm(n.test,0.5,0.1), c=rep(as.factor(2),n.test)) test <- rbind(test1, test2) # plot( subset(test, c==1)[,1:2], xlim=c(0,1), ylim=c(0,1)) # points( subset(test, c==2)[,1:2], col="red", xlab="", ylab="") # 測定 res <- c() for (cp in c(0.7,0.6,0.5,0.4,0.3,0.2,0.1,0.07,0.05,0.03,0.01,0.007,0.005,0.003,0.001)) { t <- rpart(c~ .,train, control=rpart.control(minsplit=3, cp=cp)) tbl.train <- table( train$c, predict(t, train, type="class") ) err.train <- 1 - sum(diag(tbl.train))/sum(tbl.train) tbl.test <- table( test$c, predict(t, test, type="class") ) err.test <- 1 - sum(diag(tbl.test))/sum(tbl.test) print( c(cp, err.train, err.test) ) res <- rbind(res, c(cp, err.train, err.test) ) } dev.new() plot(res[,1:2],log="x", type="l",xlim=c(max(res[,1]),min(res[,1])), ylim=c(0,1.0), xlab="cp in rpart",ylab="err.train & err.test") points(res[,c(1,3)],type="l",col="red",xlab="",ylab="")