# Algorithm_Classification

library(data.table)

library(text2vec)

library(class)

library(caret)

# Load the labelled training set and the unlabelled test set.
# train.dat is read with fread's auto-detected settings; it is given the two
# column names rating / review below.
train <- fread("train.dat")

# test.dat is read as tab-separated with no header row; it is given the single
# column name review below.
# NOTE(review): sep2 is kept from the original call; data.table's docs
# describe sep2 as not yet implemented -- confirm whether it can be dropped.
test <- fread("test.dat", sep2 = "\n", sep = "\t", header = FALSE)

colnames(train) <- c("rating", "review")
colnames(test) <- c("review")

# Placeholder label: rating == 0 marks the test rows so they can be told apart
# from training rows once both sets are stacked.
test$rating <- 0

# Stack both sets so they share one preprocessing pass. The column order
# differs (rating/review vs review/rating), so bind by name explicitly.
train_test <- rbind(train, test, use.names = TRUE)

# Strip anything that looks like an HTML tag (e.g. <br />).
train_test$review <- gsub("<[^>]+>", "", train_test$review)

# Remove the two-character sequences b" and b', newlines, backslashes and
# double quotes.
train_test$review <- gsub('b"|b\'|\n|\\\\|\\"', "", train_test$review)

# Remove punctuation. The capture group matches < and > first and substitutes
# them with themselves, so those two characters are kept.
train_test$review <- gsub("([<>])|[[:punct:]]", "\\1", train_test$review)

# Words excluded from the vocabulary (the duplicated "for" entry was dropped).
stop_words <- c(
  "the", "to", "of", "for", "br", "this", "in", "a", "b", "and", "on", "is",
  "by", "that", "with", "from", "as", "it", "are", "have", "be", "us", "an",
  "was", "u", "i"
)

# Token iterator over every review (train and test rows together): each text
# is passed through tolower and then through text2vec's word_tokenizer.
train_test_tokens <- itoken(
  train_test$review,
  preprocessor = tolower,
  tokenizer = word_tokenizer,
  progressbar = TRUE
)

# Vocabulary built from all tokens, excluding the stop words.
vocab <- create_vocabulary(train_test_tokens, stopwords = stop_words)

# Prune the vocabulary using the term-count and document-proportion
# thresholds below, with vocab_term_max limiting it to 1000 terms.
pruned_vocab <- prune_vocabulary(
  vocab,
  term_count_min = 10,
  doc_proportion_min = 0.001,
  doc_proportion_max = 0.95,
  vocab_term_max = 1000
)

# Document-term matrix for all reviews, built from the pruned vocabulary.
vectorizer <- vocab_vectorizer(pruned_vocab)
dtm_train_test <- create_dtm(train_test_tokens, vectorizer)

# Densify the document-term matrix so it can be column-bound to the labels.
term_features <- as.matrix(dtm_train_test)

# Prefix the term columns so a vocabulary word such as "rating" or "review"
# can never duplicate the label / text column names that are addressed by
# name below (and by the later model formula).
if (ncol(term_features) > 0) {
  colnames(term_features) <- paste0("term_", colnames(term_features))
}

train_test1 <- cbind(train_test, data.frame(term_features))

# The raw text is not used as a feature; only the term counts are kept.
train_test1[, review := NULL]

# Split the stacked table again: rating == 0 is the placeholder that marks
# test rows, everything else is training data.
train <- subset(train_test1, rating != 0)
test <- subset(train_test1, rating == 0)

# The placeholder label carries no information for the test rows.
test[, rating := NULL]

# Store the label as a factor (categorical outcome rather than a number).
train$rating <- as.factor(train$rating)

# Fit a k-nearest-neighbours classifier through caret, using every remaining
# column as a predictor. `train` here resolves to caret's function even though
# a data object of the same name exists, because R skips non-functions when
# looking up a call.
knn_fit <- train(rating ~ ., data = train, method = "knn")

# Predicted class for every test row.
pred <- predict(knn_fit, test, type = "raw")

# as.numeric() on a factor yields the integer level codes, not the labels.
# Code 1 (the first level) is written as -1 and every other code as 1.
# NOTE(review): this assumes the training labels are -1 / +1 so that the
# first level is -1 -- confirm against train.dat.
pred1 <- as.numeric(pred)
pred2 <- ifelse(pred1 == 1, -1, 1)

# One-column table named V1, written with a header row and no row names.
pred2 <- as.data.table(pred2)
setnames(pred2, 1L, "V1")

write.table(pred2, "output.dat", row.names = FALSE)