## ----eval=FALSE--------------------------------------------------------------- # options(width=100L) # fn <- "dbpedia_csv.tar.gz" # # if ( !file.exists(fn) ) { # download.file("https://github.com/le-scientifique/torchDatasets/raw/master/dbpedia_csv.tar.gz", # fn) # untar(fn) # } ## ----eval=FALSE--------------------------------------------------------------- # library("fastTextR") # # train <- sample(sprintf("__label__%s", readLines("dbpedia_csv/train.csv"))) # head(train, 2) ## ----eval=FALSE--------------------------------------------------------------- # train <- ft_normalize(train) # writeLines(train, con = "dbpedia.train") # # test <- readLines("dbpedia_csv/test.csv") # labels <- trimws(gsub(",.*", "", test)) # table(labels) ## ----eval=FALSE--------------------------------------------------------------- # test <- ft_normalize(test) # test <- trimws(sub(".*?,", "", test)) # head(test, 2) ## ----eval=FALSE--------------------------------------------------------------- # cntrl <- ft_control(word_vec_size = 10L, learning_rate = 0.1, max_len_ngram = 2L, # min_count = 1L, nbuckets = 10000000L, epoch = 5L, nthreads = 4L) # # model <- ft_train(file = "dbpedia.train", method = "supervised", control = cntrl) # ft_save(model, "dbpedia.bin") ## ----eval=FALSE--------------------------------------------------------------- # model <- ft_load("dbpedia.bin") ## ----eval=FALSE--------------------------------------------------------------- # test_pred <- ft_predict(model, newdata=test, k = 1L) # str(test_pred) ## ----eval=FALSE--------------------------------------------------------------- # confusion_matrix <- table(truth=as.integer(labels), # predicted=as.integer(gsub("\\D", "", test_pred$label))) # print(confusion_matrix) ## ----eval=FALSE--------------------------------------------------------------- # accuracy <- sum(diag(confusion_matrix)) / sum(confusion_matrix) # print(sprintf("Accuracy: %0.4f", accuracy))