library(tree)
data(iris)
(iris.tr<-tree(Species~.,data=iris))
plot(iris.tr,type="u"); text(iris.tr)

###

(iris.tr1<-snip.tree(iris.tr,nodes=c(12,7)))
plot(iris.tr1,type="u");text(iris.tr1)

###

library(tree)
iris.label<-c("S", "C", "V")[iris[, 5]]
plot(iris[,3],iris[,4],type="n")
text(iris[,3],iris[,4],labels=iris.label)
partition.tree(iris.tr1,add=T,col=2,cex=1.5)

###

library(tree)
data(cars)
cars.tr<-tree(dist~speed,data=cars)
print(cars.tr)
plot(cars.tr,type="u")
text(cars.tr)
plot(cars.tr,type="u")
text(cars.tr)

###

plot(cars$speed,cars$dist)
partition.tree(cars.tr,add=T,col=2)

###

(cars.tr1<-prune.tree(cars.tr,best=4))
plot(cars.tr1); text(cars.tr1,all=T)

###

plot(cars$speed,cars$dist)
partition.tree(cars.tr1,add=T,col=2)

###

setwd("D:/R/Sample")
playTennis <- read.csv("08PlayTennis.csv", header=T) 
(playTennis.tr<-tree(Play~.,data=playTennis))

plot(playTennis.tr); text(playTennis.tr)

###

library(rpart)
setwd("D:/R/Sample")
playTennis <- read.csv("08PlayTennis.csv", header=T)
(playTennis.tr <- rpart(Play~ ., playTennis ) )

plot(playTennis.tr); text(playTennis.tr)

###

library(rpart)
setwd("D:/R/Sample")
playTennis <- read.csv("08PlayTennis.csv", header=T)
(playTennis.tr <- rpart(Play~ ., playTennis, control=rpart.control(minsplit=1)) )

plot(playTennis.tr); text(playTennis.tr)

playTennisTest02 <- read.csv("08PlayTennisTest02.csv",header=TRUE)
predict(playTennis.tr, playTennisTest02)

playTennisTest02

predict(playTennis.tr, playTennisTest02, type="prob")

predict(playTennis.tr, PlayTennisTest02, type="matrix")

####################################



### cross validation
library(rpart)

setwd("D:/R/Sample")
iris <- read.csv("07iris.csv", header=T)


iris.tr <- rpart( class ~ . , data=iris, method="class" )
print( iris.tr )

# get the learning error
pred <- predict( iris.tr, newdata=iris, type="class")
cm <- table( iris$class, pred)
print( cm )

err.iris.tr <- 1 - sum(diag(cm))/sum(cm)
print( err.iris.tr )

# K-fold cross validation

n <- nrow( iris )  # number of samples
K <- 10    # number of folds

size <- n %/% K    # size of bins is "size" or "size"+1)
set.seed(7)        # this affects division of samples into c.v. bins
rnk <- rank( runif( n ) )   # this gives a permutation of 1 to n
block <- ( rnk - 1 ) %/% size + 1   # converts rnk to group numbers
head( block )      # just for check

all.err <- NULL
all.pred <- NULL
for ( k in 1:K ) {

  # learn a tree without the k-th group
  iris.tr <- rpart( class ~ . , iris[ block != k, ], method="class",
                    control=rpart.control(minsplit=30)  )
  # predict with the k-th group
  pred <- predict( iris.tr, iris[ block==k, ], type="class" )
  # confusion matrix
  cm <- table( iris$class[ block==k ], pred)
  # error rate = 1 - accuracy
  err <- 1 - sum(diag(cm))/sum(cm)
  # each error rate is combined 
  all.err <- rbind( all.err, err )
  # prediction of each fold is combined with the sample ids
  all.pred <- rbind( all.pred, 
           data.frame( id=(1:n)[ block==k ], pred=pred) )
  # use this if you want to inspect the current environment
  # browser()

}

print( as.numeric( all.err ) )

# to get the average, each error rate should be weighted.
ave.err <- t( all.err ) %*% as.numeric( table(block) ) / n 

print( as.numeric( ave.err ) )

# all the predictions are sorted according to the sample id
all.pred.sorted <- all.pred[ sort.int( all.pred$id, index.return=T )$ix, ]

( all.cm <- table( iris$class, all.pred.sorted$pred ) )
# 1 - sum(diag(all.cm))/sum(all.cm) is equal to
# ave.err





###

library(rpart)
setwd("D:/R/Sample")
playTennis <- read.csv("07PlayTennis02.csv", header=T)
(playTennis.tr <- rpart(Play~ ., playTennis, 
   control=rpart.control(minsplit=1)) )
plot(playTennis.tr); text(playTennis.tr)

###

library(rpart)
setwd("D:/R/Sample")
playTennis <- read.csv("07PlayTennis02.csv", header=T)
(playTennis.tr <- rpart(Play~ ., playTennis, 
   control=rpart.control(minsplit=3)) )
plot(playTennis.tr); text(playTennis.tr)


### R で試す決定木のCV

library(rpart)
setwd("D:/R/Sample")
xy <- read.csv("07PlayTennis02.csv", header=T)

library(bootstrap)               # crossval を使用する
theta.fit <- function (x,y) { 
  tmp <- data.frame( Outlook=x[,1], Temperature=x[,2],
          Humidity=x[,3], Windy=x[,4], class=y)
  return( rpart(class~., tmp, control=rpart.control(minsplit=1) ) )
}
theta.predict <- function( fit, x) {predict( fit, data.frame(x), type="class" ) }
results <- crossval(xy[,-5], xy[,5], theta.fit, theta.predict, ngroup=7) 
(cm <- table( xy[,5], results$cv.fit ))
(accuracy10CV <- sum(diag(cm))/sum(cm))




###############
###　過学習

#  データ生成

library(rpart)

n.train <- 200
n.test <- 200

sd <- 1.5

train1 <- data.frame(
    x=rnorm(n.train, mean=1,sd=sd), y=rnorm(n.train, mean=1,sd=sd),
    c=rep(as.factor(1),n.train)  )
train2 <- data.frame(
    x=rnorm(n.train, mean=-1,sd=sd), y=rnorm(n.train, mean=-1,sd=sd),
    c=rep(as.factor(2),n.train)  )
train <- rbind(train1, train2)

test1 <- data.frame(
    x=rnorm(n.test, mean=1,sd=sd), y=rnorm(n.test, mean=1,sd=sd),
    c=rep(as.factor(1),n.test)  )
test2 <- data.frame(
    x=rnorm(n.test, mean=-1,sd=sd), y=rnorm(n.test, mean=-1,sd=sd),
    c=rep(as.factor(2),n.test)  )
test <- rbind(test1, test2)

plot(   subset(train, c==1)[,1:2], xlim=c(-5,5), ylim=c(-5,5)  )
points( subset(train, c==2)[,1:2], col="red", xlab="", ylab="" )


###  一様分布と正規分布の重なり
set.seed(2015)
n.train <- 100
n.test <- 100
train1.x <- runif(n.train)
train1.y <- runif(n.train)
train1.c <- rep(as.factor(1),n.train)
train1 <- data.frame(x=train1.x, y=train1.y, c=train1.c)

train2.x <- rnorm(n.train,0.5,0.1)
train2.y <- rnorm(n.train,0.5,0.1)
train2.c <- rep(as.factor(2),n.train)
train2 <- data.frame(x=train2.x, y=train2.y, c=train2.c)

train <- rbind(train1, train2)

# plot( subset(train, c==1)[,1:2], xlim=c(0,1), ylim=c(0,1))
# points( subset(train, c==2)[,1:2], col="red", xlab="", ylab="")


test1 <- data.frame(x=runif(n.test), y=runif(n.test), c=rep(as.factor(1),n.test))
test2 <- data.frame(x=rnorm(n.test,0.5,0.1), y=rnorm(n.test,0.5,0.1), c=rep(as.factor(2),n.test))

test <- rbind(test1, test2)

# plot( subset(test, c==1)[,1:2], xlim=c(0,1), ylim=c(0,1))
# points( subset(test, c==2)[,1:2], col="red", xlab="", ylab="")



# 測定

res <- c()
for (cp in c(0.7,0.6,0.5,0.4,0.3,0.2,0.1,0.07,0.05,0.03,0.01,0.007,0.005,0.003,0.001)) {
  t <- rpart(c~ .,train,  control=rpart.control(minsplit=3, cp=cp)) 
  tbl.train <- table( train$c, predict(t, train, type="class") )
  err.train <- 1 - sum(diag(tbl.train))/sum(tbl.train)
  tbl.test <- table( test$c, predict(t, test, type="class") )
  err.test <- 1 - sum(diag(tbl.test))/sum(tbl.test)
  print( c(cp, err.train, err.test) )
  res <- rbind(res, c(cp, err.train, err.test) )
}

dev.new() 

plot(res[,1:2],log="x", type="l",xlim=c(max(res[,1]),min(res[,1])),
  ylim=c(0,1.0), xlab="cp in rpart",ylab="err.train & err.test")
points(res[,c(1,3)],type="l",col="red",xlab="",ylab="")