# defaultの stats パッケージに含まれている hclustを試してみる
#
# まず、簡単なデータで。
# MASS パッケージに含まれている Animals を使おう
# csv ファイルとして保存してある
# Load the Animals data (originally shipped with the MASS package), which has
# been saved as a CSV file in the working directory selected here.
setwd("D:/R/Sample")
Animals <- read.csv(file = "11Animals.csv", header = TRUE)

# Take a first look at the contents.
summary(Animals)

# The animal names arrive as an ordinary column (X). That is a common layout
# for data files but awkward to work with in R, so keep only columns 2 and 3
# as data and move the names into the row names.
myAnimals <- Animals[, 2:3]
row.names(myAnimals) <- Animals$X

# The attributes (only two of them here) are on very different scales, so
# standardize every column to mean 0 and variance 1.
myAnimals <- scale(myAnimals, center = TRUE, scale = TRUE)
# The following also works:
#   myAnimals <- scale(Animals[-1]); rownames(myAnimals) <- Animals$X

# Hierarchical clustering works from a distance matrix.
# dist() uses Euclidean distance by default. The diagonal is always 0, so it
# is left out (diag only controls whether the diagonal is shown when the
# distance object is printed).
myAnimalsDistance <- dist(myAnimals, diag = FALSE)
# Show the distances rounded to 3 digits (this changes little here).
round(myAnimalsDistance, 3)

# First, the default linkage, "complete" (see ?hclust).
Animals.clst.complete <- hclust(myAnimalsDistance)
plot(Animals.clst.complete)

# Next, average linkage.
Animals.clst.average <- hclust(myAnimalsDistance, method = "average")
plot(Animals.clst.average)

# Finally Ward's method, which is usually the recommended choice.
# The old method name "ward" was renamed to "ward.D" in R 3.1.0 and only still
# works with a message. For plain (unsquared) Euclidean distances such as
# these, "ward.D2" is the variant that implements Ward's criterion.
Animals.clst.ward <- hclust(myAnimalsDistance, method = "ward.D2")
plot(Animals.clst.ward)
# Exercise: are the three dendrograms the same or different?

# Now try the iris data.
# All 150 rows are too many for a first experiment, so draw 15 rows at random
# (no seed is set, so the sample differs from run to run).
myIris <- iris[sample.int(nrow(iris), size = 15), ]
# The last column is dropped before computing distances.
irisDistance <- dist(myIris[, -ncol(myIris)], diag = FALSE)
iris.clst <- hclust(d = irisDistance, method = "average")
plot(iris.clst)
# 15 rows may be too few, so repeat the experiment with 50 random rows
# (no seed is set, so the sample differs from run to run).
myIris <- iris[sample.int(nrow(iris), 50), ]
# The last column (Species) is dropped before computing distances.
irisDistance <- dist(myIris[, -ncol(myIris)], diag = FALSE)
# The old method name "ward" was renamed to "ward.D" in R 3.1.0. For plain
# Euclidean distances "ward.D2" is the variant that implements Ward's
# criterion.
iris.clst <- hclust(irisDistance, method = "ward.D2")
plot(iris.clst)

# Are the species separated well?
# rect.hclust() draws boxes around the k clusters on the dendrogram plotted
# above and returns, per cluster, the row positions of its members in myIris.
cc <- rect.hclust(iris.clst, k = 3)

# Species of the members of each cluster.
lapply(cc, function(members) myIris$Species[members])

# Cross-tabulate cluster number against species. Indexing the Species factor
# directly (instead of combining factors with c()) keeps the species labels
# on every R version, and rep()/lengths() work for any number of clusters.
(iris.clst.table <- table(
  cluster = rep(seq_along(cc), lengths(cc)),
  species = myIris$Species[unlist(cc)]
))
# Exercise: try every method. Which one looks best?
# Also try the full data set (150 rows) with a method of your choice.


# k-means, from the stats package
#
# Two synthetic groups of 50 points in two dimensions: one drawn around 0 and
# one drawn around 1, both with standard deviation 0.3.
x <- rbind(
  matrix(rnorm(100, mean = 0, sd = 0.3), nrow = 50, ncol = 2),
  matrix(rnorm(100, mean = 1, sd = 0.3), nrow = 50, ncol = 2)
)
dimnames(x) <- list(NULL, c("x", "y"))
# Fit two clusters; the parentheses print the result as well as storing it.
(cl <- kmeans(x, centers = 2))
# Colour each point by its assigned cluster and mark the centres with stars.
plot(x, col = cl$cluster)
points(cl$centers, col = 1:2, pch = 8, cex = 2)

# The same experiment in three dimensions: two groups of 40 points, drawn
# around 0 and around 0.5, both with standard deviation 0.3.
x <- rbind(
  matrix(rnorm(120, mean = 0, sd = 0.3), nrow = 40, ncol = 3),
  matrix(rnorm(120, mean = 0.5, sd = 0.3), nrow = 40, ncol = 3)
)
dimnames(x) <- list(NULL, c("x", "y", "z"))
(cl <- kmeans(x, centers = 2))

# plot() and points() on a matrix use only its first two columns, so this
# view shows x against y.
plot(x[, 1:2], col = cl$cluster)
points(cl$centers[, 1:2], col = 1:2, pch = 8, cex = 2)

# The y against z view.
plot(x[, 2:3], col = cl$cluster)
points(cl$centers[, 2:3], col = 1:2, pch = 8, cex = 2)

# Converted to a data.frame, plot() draws all pairwise views at once.
x <- data.frame(x)
plot(x, col = cl$cluster)


# k-means on the iris measurements (column 5, Species, is dropped).
x <- iris[, -5]
(cl <- kmeans(x, 3))

# With more than two columns, plot() on a data.frame draws a scatterplot
# matrix (pairs). This gives the overview of all attribute pairs, coloured by
# cluster.
plot(x, col = cl$cluster)

# points() cannot overlay the centres onto a scatterplot matrix (it would draw
# in the wrong coordinate system), so show the cluster centres on a single
# two-attribute scatter plot instead.
petal <- c("Petal.Length", "Petal.Width")
plot(x[, petal], col = cl$cluster)
points(cl$centers[, petal], col = 1:3, pch = 8, cex = 2)