# Algorithm_Classification
library(data.table)
library(text2vec)
library(class)
library(caret)
# Sentiment classification of reviews with a bag-of-words kNN model.
#
# Reads "train.dat" (label + review text) and "test.dat" (review text only),
# builds a shared document-term matrix over both sets, fits a kNN classifier
# with caret on the training rows and writes one prediction (-1 or 1) per
# test review to "output.dat".

# ---- Load data ----
train_raw <- fread("train.dat")
# NOTE(review): data.table documents `sep2` as not yet implemented; it is kept
# here only so the call reads the file exactly as before -- confirm and drop.
test_raw <- fread("test.dat", sep2 = "\n", sep = "\t", header = FALSE)
colnames(train_raw) <- c("rating", "review")
colnames(test_raw) <- c("review")

# Training reviews come first, test reviews after. Remembering the number of
# training rows lets us split the feature matrix again later without relying
# on a sentinel label value.
n_train <- nrow(train_raw)
reviews <- c(train_raw[["review"]], test_raw[["review"]])

# ---- Clean text ----
# Drop anything that looks like an <...> tag.
reviews <- gsub("<[^>]+>", "", reviews)
# Drop `b"` / `b'` sequences, newlines, literal backslashes and double quotes
# (presumably leftovers of Python bytes-literal formatting -- confirm).
reviews <- gsub('b"|b\'|\n|\\\\|\\"', "", reviews)
# Delete all punctuation except `<` and `>`, which are put back via `\\1`.
reviews <- gsub("([<>])|[[:punct:]]", "\\1", reviews)

# ---- Vectorize ----
stop_words <- c(
  "the", "to", "of", "for", "br", "this", "in", "a", "b", "and", "on", "is",
  "by", "that", "with", "from", "as", "it", "are", "have", "be", "us", "an",
  "was", "u", "i"
)

review_tokens <- itoken(
  reviews,
  preprocessor = tolower,
  tokenizer = word_tokenizer,
  progressbar = TRUE
)

vocab <- create_vocabulary(review_tokens, stopwords = stop_words)
pruned_vocab <- prune_vocabulary(
  vocab,
  term_count_min = 10,
  doc_proportion_max = 0.95,
  doc_proportion_min = 0.001,
  vocab_term_max = 1000
)
vectorizer <- vocab_vectorizer(pruned_vocab)
dtm <- create_dtm(review_tokens, vectorizer)

# ---- Assemble model frames ----
# Densify the document-term matrix. The `term_` prefix guarantees that a
# vocabulary term such as "rating" can never collide with the label column
# used in the model formula below.
features <- as.data.frame(as.matrix(dtm))
names(features) <- make.names(paste0("term_", names(features)), unique = TRUE)

is_train <- seq_len(nrow(features)) <= n_train
train_set <- features[is_train, , drop = FALSE]
train_set$rating <- as.factor(train_raw[["rating"]])
test_set <- features[!is_train, , drop = FALSE]

# ---- Fit and predict ----
# No trControl/tuneGrid is passed, so caret's defaults apply.
knn_fit <- train(rating ~ ., data = train_set, method = "knn")
pred <- predict(knn_fit, test_set, type = "raw")

# Map factor level codes to output labels: first level -> -1, any other -> 1.
# Assumes the first level of `rating` is the negative class (true for labels
# -1 / 1) -- confirm against the training data.
pred_label <- ifelse(as.numeric(pred) == 1, -1, 1)

# ---- Write predictions ----
output <- data.table(V1 = pred_label)
write.table(output, "output.dat", row.names = FALSE)