# Multinomial naive Bayes classifier for the 20 Newsgroups word-count files.
#
# Input files (read from the working directory):
#   train.data / test.data   : three columns -> docNo, word, count
#   train.label / test.label : one column    -> cat; row i is used as the
#                              category of document i (labels are indexed
#                              by docNo below)
#
# Outputs left in the workspace: the trained log-probabilities
# (trainCatProbLog, trainCatWordProbLog), per-document predictions
# (predictedCat, correctCat, compare), the confusion matrix `cm`, and
# `accuracy`.

library(Hmisc) # originally loaded for `inc<-`; kept so dependencies are unchanged

# NOTE(review): hard-coded, machine-specific working directory. Kept because
# later code may rely on it; consider passing the data directory in instead.
setwd("D:/R/20Newsgroups")

# ---- Read training data ------------------------------------------------------
train.data <- read.table("train.data")
names(train.data) <- c("docNo", "word", "count")
head(train.data)

train.label <- read.table("train.label")
names(train.label) <- c("cat")

# Attach the category of each document to every (document, word) row.
cat <- train.label$cat[train.data$docNo]
train <- data.frame(cat = cat, word = train.data$word, count = train.data$count)
head(train)
tail(train)

# ---- Read test data ----------------------------------------------------------
test.data <- read.table("test.data")
names(test.data) <- c("docNo", "word", "count")
head(test.data)

test.label <- read.table("test.label")
names(test.label) <- c("cat")

cat <- test.label$cat[test.data$docNo]
test <- data.frame(
  cat = cat,
  docNo = test.data$docNo,
  word = test.data$word,
  count = test.data$count
)
head(test)
tail(test)

# ---- Training ----------------------------------------------------------------
catMaxN <- max(train$cat)
nWord <- max(train$word, test$word)

# trainA[c, w] = 1 + total count of word w in training documents of category c.
# Starting every cell at 1 is add-one (Laplace) smoothing, so words unseen in a
# category still get a non-zero probability.
trainA <- array(1, dim = c(catMaxN, nWord))

# Sum the counts per (category, word) cell in one pass. Each cell is addressed
# by its column-major linear index; as.numeric() avoids integer overflow for
# large vocabularies. With reorder = FALSE, rowsum() returns groups in order of
# first appearance, which is exactly the order of unique(cellIndex).
cellIndex <- (as.numeric(train$word) - 1) * catMaxN + train$cat
cellCount <- rowsum(train$count, cellIndex, reorder = FALSE)
cells <- unique(cellIndex)
trainA[cells] <- trainA[cells] + cellCount[, 1]

# Class prior P(c): share of training documents labelled c. Counting from
# train.label counts documents (not word rows), and tabulate() keeps position i
# aligned with category i even when some category has no documents.
trainCatN <- tabulate(train.label$cat, nbins = catMaxN)
trainCatProb <- trainCatN / sum(trainCatN)

# Word likelihood P(w | c): each row of trainA divided by its row total.
# A vector of length nrow(trainA) recycles down the columns, so element
# [c, w] is divided by trainCatWordN[c].
trainCatWordN <- rowSums(trainA)
trainCatWordProb <- trainA / trainCatWordN

trainCatProbLog <- log(trainCatProb)
trainCatWordProbLog <- log(trainCatWordProb)

# ---- Testing -----------------------------------------------------------------
# One data frame per test document, in increasing docNo order.
testByDocNo <- split(test, test$docNo)
length(testByDocNo) ## number of test documents
nrow(testByDocNo[[1]]) ## number of different words in the first document

# Log-posterior score (up to an additive constant) of every category for one
# test document.
#
# docNIndexinTest: position of the document within `testByDocNo`.
# Returns a numeric vector of length catMaxN; element c equals
#   log P(c) + sum_w count(w) * log P(w | c).
probListByCat <- function(docNIndexinTest) {
  doc <- testByDocNo[[docNIndexinTest]]
  # drop = FALSE keeps a matrix even when the document has a single word.
  wordLogProb <- trainCatWordProbLog[, doc$word, drop = FALSE]
  trainCatProbLog + as.vector(wordLogProb %*% doc$count)
}

predictedCat <- vapply(
  seq_along(testByDocNo),
  function(x) which.max(probListByCat(x)),
  integer(1)
)

# The names of testByDocNo are the document numbers, which index test.label.
correctCat <- test.label$cat[as.integer(names(testByDocNo))]

compare <- data.frame(predicted = predictedCat, correct = correctCat)

# confusion matrix
# Both margins use the same full set of levels so the table is always square,
# even if some category is never predicted.
catLevels <- seq_len(max(catMaxN, correctCat))
(cm <- table(
  predicted = factor(compare$predicted, levels = catLevels),
  correct = factor(compare$correct, levels = catLevels)
))

nCorrectDoc <- sum(compare$predicted == compare$correct)
nTestDoc <- length(testByDocNo)

(accuracy <- nCorrectDoc / nTestDoc)