## ----setup, include=FALSE-----------------------------------------------------
knitr::opts_chunk$set(fig.width = 50, fig.height = 50, warning = FALSE, message = FALSE, echo = FALSE, cache = FALSE, eval = FALSE)

dir_wili_2018 = '/IN CASE I WANT TO REBUILD THE VIGNETTE MODIFY AND POINT THIS DIR TO THE WILI-2018 DIRECTORY/wili-2018'
dir_files = '/IN CASE I WANT TO REBUILD THE VIGNETTE MODIFY AND POINT THIS DIR TO THE HUMAN DECLARATION FILES/declaration_human_rights'
# 'dir_files' is re-defined in a later chunk in case the declaration files are downloaded inside the 'wili-2018' directory


## ----echo = T-----------------------------------------------------------------
#
# # fastText language identification supported languages as described in
# # https://fasttext.cc/docs/en/language-identification.html
#
# fasttext_supported_languages = c('af', 'als', 'am', 'an', 'ar', 'arz', 'as', 'ast', 'av',
#                                  'az', 'azb', 'ba', 'bar', 'bcl', 'be', 'bg', 'bh', 'bn',
#                                  'bo', 'bpy', 'br', 'bs', 'bxr', 'ca', 'cbk', 'ce', 'ceb',
#                                  'ckb', 'co', 'cs', 'cv', 'cy', 'da', 'de', 'diq', 'dsb',
#                                  'dty', 'dv', 'el', 'eml', 'en', 'eo', 'es', 'et', 'eu',
#                                  'fa', 'fi', 'fr', 'frr', 'fy', 'ga', 'gd', 'gl', 'gn',
#                                  'gom', 'gu', 'gv', 'he', 'hi', 'hif', 'hr', 'hsb', 'ht',
#                                  'hu', 'hy', 'ia', 'id', 'ie', 'ilo', 'io', 'is', 'it',
#                                  'ja', 'jbo', 'jv', 'ka', 'kk', 'km', 'kn', 'ko', 'krc',
#                                  'ku', 'kv', 'kw', 'ky', 'la', 'lb', 'lez', 'li', 'lmo',
#                                  'lo', 'lrc', 'lt', 'lv', 'mai', 'mg', 'mhr', 'min', 'mk',
#                                  'ml', 'mn', 'mr', 'mrj', 'ms', 'mt', 'mwl', 'my', 'myv',
#                                  'mzn', 'nah', 'nap', 'nds', 'ne', 'new', 'nl', 'nn', 'no',
#                                  'oc', 'or', 'os', 'pa', 'pam', 'pfl', 'pl', 'pms', 'pnb',
#                                  'ps', 'pt', 'qu', 'rm', 'ro', 'ru', 'rue', 'sa', 'sah',
#                                  'sc', 'scn', 'sco', 'sd', 'sh', 'si', 'sk', 'sl', 'so',
#                                  'sq', 'sr', 'su', 'sv', 'sw', 'ta', 'te', 'tg', 'th', 'tk',
#                                  'tl', 'tr', 'tt', 'tyv', 'ug', 'uk', 'ur', 'uz', 'vec',
#                                  'vep', 'vi', 'vls', 'vo', 'wa', 'war', 'wuu', 'xal', 'xmf',
#                                  'yi', 'yo', 'yue', 'zh')


## ----echo = T-----------------------------------------------------------------
#
# isocodes = ISOcodes::ISO_639_2
#
# head(isocodes)
#
# comp_cases = complete.cases(isocodes$Alpha_2)
# isocodes_fasttext = isocodes[comp_cases, ]
#
# dim(isocodes_fasttext)
#
# idx_keep_fasttext = which(isocodes_fasttext$Alpha_2 %in% fasttext_supported_languages)
#
# isocodes_fasttext = isocodes_fasttext[idx_keep_fasttext, ]
# isocodes_fasttext = data.table::data.table(isocodes_fasttext)
#
# isocodes_fasttext
#
# lower_nams = tolower(isocodes_fasttext$Name)
# lower_nams = trimws(as.vector(unlist(lapply(strsplit(lower_nams, "[;, ]"), function(x) x[1]))), which = 'both')      # keep only the first name when a language is listed under several names
#
# isocodes_fasttext$Name_tolower = lower_nams
# isocodes_fasttext


## ----echo = T-----------------------------------------------------------------
#
# print_accuracy = function(size_input_data,
#                           true_data,
#                           preds_data,
#                           method) {
#
#   cat(glue::glue("Total Rows: {size_input_data}"), '\n')
#   rnd_2 = round(length(preds_data) / size_input_data, 4)
#   msg_2 = "Predicted Rows: {length(preds_data)} ({rnd_2 * 100}% predicted)"
#   cat(glue::glue(msg_2), '\n')
#   cat(glue::glue("Missing Values: {size_input_data - length(preds_data)}"), '\n')
#   rnd_4 = round(sum(true_data == preds_data) / length(preds_data), 4)
#   msg_4 = "Accuracy on 'Predicted Rows' using '{method}': {rnd_4 * 100}%"
#   cat(glue::glue(msg_4), '\n')
# }


## -----------------------------------------------------------------------------
#
# list.files(dir_wili_2018)
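## ----echo = T-----------------------------------------------------------------
#
# # A minimal sketch (an addition, not part of the original benchmark) of how
# # the 'print_accuracy' helper behaves: 'toy_true' and 'toy_preds' are
# # hypothetical vectors where 4 out of 5 input rows received a prediction
# # and 3 of the 4 predicted labels are correct
#
# toy_true = c('eng', 'deu', 'fra', 'ell')
# toy_preds = c('eng', 'deu', 'fra', 'spa')
#
# print_accuracy(size_input_data = 5,
#                true_data = toy_true,
#                preds_data = toy_preds,
#                method = 'toy example')
#
# # Total Rows: 5
# # Predicted Rows: 4 (80% predicted)
# # Missing Values: 1
# # Accuracy on 'Predicted Rows' using 'toy example': 75%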
## -----------------------------------------------------------------------------
#
# pth_x_test = file.path(dir_wili_2018, 'x_test.txt')
# wili_test_x = data.table::fread(file = pth_x_test, sep = '\n', stringsAsFactors = F, header = F, nThread = parallel::detectCores())
#
# pth_y_test = file.path(dir_wili_2018, 'y_test.txt')
# wili_test_y = data.table::fread(file = pth_y_test, sep = '\n', stringsAsFactors = F, header = F, nThread = parallel::detectCores())
# nrow_init = nrow(wili_test_y)
#
# inters_labels = which(wili_test_y$V1 %in% isocodes_fasttext$Alpha_3_B)
#
# # subset both the 'x_test' and the 'y_test' data
#
# wili_test_x = wili_test_x[inters_labels, ]
# wili_test_y = wili_test_y[inters_labels, ]
#
# cat(glue::glue("Initial observations: {nrow_init}  Subset based on isocodes: {nrow(wili_test_y)}  Number of languages based on subset: {length(unique(wili_test_y$V1))}"), '\n')
#
# head(wili_test_y)


## ----echo = T-----------------------------------------------------------------
#
# file_ftz = system.file("language_identification/lid.176.ftz", package = "fastText")
#
# dtbl_res_in = fastText::language_identification(input_obj = wili_test_x$V1,
#                                                 pre_trained_language_model_path = file_ftz,
#                                                 k = 1,
#                                                 th = 0.0,
#                                                 threads = 1,
#                                                 verbose = TRUE)


## ----echo = T-----------------------------------------------------------------
#
# dtbl_res_in$true_label = wili_test_y$V1
#
# dtbl_res_in
#
# isocodes_fasttext_subs = isocodes_fasttext[, c(1,3)]      # merge the predicted labels with the 3-letter isocodes
#
# merg_labels = merge(dtbl_res_in, isocodes_fasttext_subs, by.x = 'iso_lang_1', by.y = 'Alpha_2')
#
# as.vector(colSums(is.na(merg_labels)))
#
# print_accuracy(size_input_data = nrow(wili_test_y),
#                true_data = merg_labels$true_label,
#                preds_data = merg_labels$Alpha_3_B,
#                method = 'fastText (.ftz pre-trained model)')


## ----echo = T-----------------------------------------------------------------
#
# file_bin = file.path(dir_wili_2018, 'lid.176.bin')
#
# dtbl_res_in = fastText::language_identification(input_obj = wili_test_x$V1,
#                                                 pre_trained_language_model_path = file_bin,
#                                                 k = 1,
#                                                 th = 0.0,
#                                                 threads = 1,
#                                                 verbose = TRUE)


## ----echo = T-----------------------------------------------------------------
#
# dtbl_res_in$true_label = wili_test_y$V1
#
# dtbl_res_in
#
# isocodes_fasttext_subs = isocodes_fasttext[, c(1,3)]      # merge the predicted labels with the 3-letter isocodes
#
# merg_labels = merge(dtbl_res_in, isocodes_fasttext_subs, by.x = 'iso_lang_1', by.y = 'Alpha_2')
#
# as.vector(colSums(is.na(merg_labels)))
#
# print_accuracy(size_input_data = nrow(wili_test_y),
#                true_data = merg_labels$true_label,
#                preds_data = merg_labels$Alpha_3_B,
#                method = 'fastText (.bin pre-trained model)')


## ----echo = T-----------------------------------------------------------------
#
# tbl = table(merg_labels$true_label, merg_labels$Alpha_3_B)
#
# df = as.data.frame.table(tbl)
# colnames(df) = c('lang_vert', 'lang_horiz', 'Freq')       # true (vertical) and predicted (horizontal) language isocodes
#
# head(df)
#
# require(magrittr)
# require(dplyr)
# require(ggplot2)
#
# df <- df %>%
#   mutate(lang_vert = factor(lang_vert),                   # alphabetical order by default
#          lang_horiz = factor(lang_horiz, levels = rev(unique(lang_horiz))))
#
# plt_tbl = ggplot(df, aes(x = lang_vert, y = lang_horiz, fill = Freq)) +
#   geom_tile() + theme_bw() + coord_equal() +
#   scale_fill_distiller(palette = "Greens", direction = 1) +
#   ggplot2::theme(axis.text.x = element_text(angle = 45, vjust = 1.0, hjust = 1.0))
#
# plt_tbl
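## ----echo = T-----------------------------------------------------------------
#
# # An optional addition (not part of the original vignette) that complements
# # the heat map by computing the per-language accuracy of the .bin model from
# # the 'merg_labels' object using data.table grouping
#
# merg_labels_dt = data.table::as.data.table(merg_labels)
#
# acc_per_lang = merg_labels_dt[, list(obs = .N,
#                                      acc = round(mean(true_label == Alpha_3_B) * 100, 2)),
#                               by = 'true_label']
#
# acc_per_lang[order(acc_per_lang$acc, decreasing = F), ]      # worst-recognized languages appear first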
## ----echo = T-----------------------------------------------------------------
#
# require(cld2)
#
# t_start = proc.time()
# cld2_vec = cld2::detect_language(text = wili_test_x$V1, plain_text = TRUE, lang_code = TRUE)
#
# cld2_dtbl = data.table::setDT(list(Alpha_2 = cld2_vec))
# cld2_dtbl$true_label = wili_test_y$V1
#
# merg_labels_cld2 = merge(cld2_dtbl, isocodes_fasttext_subs, by = 'Alpha_2')
#
# as.vector(colSums(is.na(merg_labels_cld2)))
#
# print_accuracy(size_input_data = nrow(wili_test_y),
#                true_data = merg_labels_cld2$true_label,
#                preds_data = merg_labels_cld2$Alpha_3_B,
#                method = 'cld2')


## ----echo = F-----------------------------------------------------------------
#
# fastText:::compute_elapsed_time(t_start)


## ----echo = T-----------------------------------------------------------------
#
# require(cld3)
#
# t_start = proc.time()
# cld3_vec = cld3::detect_language(text = wili_test_x$V1)
#
# cld3_dtbl = data.table::setDT(list(Alpha_2 = cld3_vec))
# cld3_dtbl$true_label = wili_test_y$V1
#
# merg_labels_cld3 = merge(cld3_dtbl, isocodes_fasttext_subs, by = 'Alpha_2')
#
# as.vector(colSums(is.na(merg_labels_cld3)))
#
# print_accuracy(size_input_data = nrow(wili_test_y),
#                true_data = merg_labels_cld3$true_label,
#                preds_data = merg_labels_cld3$Alpha_3_B,
#                method = 'cld3')


## ----echo = F-----------------------------------------------------------------
#
# fastText:::compute_elapsed_time(t_start)


## ----echo = T-----------------------------------------------------------------
#
# threads = parallel::detectCores()
# require(textcat)
#
# names(textcat::TC_byte_profiles)


## -----------------------------------------------------------------------------
#
# nams_profiles = as.vector(unlist(lapply(strsplit(names(textcat::TC_byte_profiles), '-'), function(x) x[1])))
# nams_profiles = unique(nams_profiles)
# cat(glue::glue("Isocode-Names: {length(unique(isocodes_fasttext$Name_tolower))}  TC_byte_profiles: {length(names(textcat::TC_byte_profiles))}  Intersected Names: {length(intersect(nams_profiles, unique(isocodes_fasttext$Name_tolower)))}"), '\n')


## ----echo = T-----------------------------------------------------------------
#
# t_start = proc.time()
# textc = as.vector(unlist(parallel::mclapply(1:length(wili_test_x$V1), function(x) {
#   textcat(x = wili_test_x$V1[x], p = textcat::TC_byte_profiles, method = "CT")
# }, mc.cores = threads)))
#
# textc = as.vector(unlist(lapply(strsplit(textc, '-'), function(x) x[1])))
# textc = trimws(textc, which = 'both')
#
# unique(textc)


## ----echo = T-----------------------------------------------------------------
#
# textc_dtbl = data.table::setDT(list(Name_tolower = textc))
# textc_dtbl$true_label = wili_test_y$V1
#
# fasttext_isoc_name = isocodes_fasttext[, c(1,5)]
# merg_labels_textc = merge(textc_dtbl, fasttext_isoc_name, by = 'Name_tolower')
#
# as.vector(colSums(is.na(merg_labels_textc)))
#
# print_accuracy(size_input_data = nrow(wili_test_y),
#                true_data = merg_labels_textc$true_label,
#                preds_data = merg_labels_textc$Alpha_3_B,
#                method = 'textcat ( TC_byte_profiles )')


## ----echo = F-----------------------------------------------------------------
#
# fastText:::compute_elapsed_time(t_start)
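## ----echo = T-----------------------------------------------------------------
#
# # A small illustrative addition (not part of the original benchmark): several
# # 'textcat' profiles include an encoding suffix after a '-' (for instance
# # 'russian-koi8_r'), which is why the previous chunks keep only the part
# # before the first '-' when matching the profile names against
# # 'isocodes_fasttext$Name_tolower'
#
# grep('-', names(textcat::TC_byte_profiles), value = TRUE)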
## ----echo = T-----------------------------------------------------------------
#
# names(textcat::TC_char_profiles)


## -----------------------------------------------------------------------------
#
# nams_profiles = as.vector(unlist(lapply(strsplit(names(textcat::TC_char_profiles), '-'), function(x) x[1])))
# nams_profiles = unique(nams_profiles)
# cat(glue::glue("Isocode-Names: {length(unique(isocodes_fasttext$Name_tolower))}  TC_char_profiles: {length(names(textcat::TC_char_profiles))}  Intersected Names: {length(intersect(nams_profiles, unique(isocodes_fasttext$Name_tolower)))}"), '\n')


## ----echo = T-----------------------------------------------------------------
#
# t_start = proc.time()
# textc = as.vector(unlist(parallel::mclapply(1:length(wili_test_x$V1), function(x) {
#   textcat(x = wili_test_x$V1[x], p = textcat::TC_char_profiles, method = "CT")
# }, mc.cores = threads)))
#
# textc = as.vector(unlist(lapply(strsplit(textc, '-'), function(x) x[1])))
# textc = trimws(textc, which = 'both')
#
# unique(textc)


## ----echo = T-----------------------------------------------------------------
#
# textc_dtbl = data.table::setDT(list(Name_tolower = textc))
# textc_dtbl$true_label = wili_test_y$V1
#
# fasttext_isoc_name = isocodes_fasttext[, c(1,5)]
# merg_labels_textc = merge(textc_dtbl, fasttext_isoc_name, by = 'Name_tolower')
#
# as.vector(colSums(is.na(merg_labels_textc)))
#
# print_accuracy(size_input_data = nrow(wili_test_y),
#                true_data = merg_labels_textc$true_label,
#                preds_data = merg_labels_textc$Alpha_3_B,
#                method = 'textcat ( TC_char_profiles )')


## ----echo = F-----------------------------------------------------------------
#
# fastText:::compute_elapsed_time(t_start)


## ----echo = T-----------------------------------------------------------------
#
# require(franc)
#
# t_start = proc.time()
# franc_res = as.vector(unlist(parallel::mclapply(1:length(wili_test_x$V1), function(x) {
#   franc(text = wili_test_x$V1[x], min_speakers = 0, min_length = 10, max_length = 2048)
# }, mc.cores = threads)))
#
# franc_dtbl = data.table::setDT(list(franc = franc_res, true_label = wili_test_y$V1))
#
# as.vector(colSums(is.na(franc_dtbl)))
#
# print_accuracy(size_input_data = nrow(wili_test_y),
#                true_data = franc_dtbl$true_label,
#                preds_data = franc_dtbl$franc,
#                method = 'franc')


## ----echo = F-----------------------------------------------------------------
#
# fastText:::compute_elapsed_time(t_start)


## ----echo = F-----------------------------------------------------------------
#
# dtbl_bench = data.table::setDT(list(method = c('fastText (ftz)', 'fastText (bin)', 'cld2', 'cld3', 'textcat (byte)', 'textcat (char)', 'franc'),
#                                     rows = c(50500, 50500, 50500, 50500, 50500, 50500, 50500),
#                                     pred_rows = c(50211, 50168, 34254, 43560, 47324, 43265, 50500),
#                                     pred_perc = c(99.43, 99.34, 67.83, 86.26, 93.71, 85.67, 100.0),
#                                     NAs = c(289, 332, 16246, 6940, 3176, 7235, 0),
#                                     accuracy = c(83.05, 86.55, 83.13, 74.74, 29.91, 31.1, 62.04),
#                                     seconds = c(5, 5, 2, 18, 83, 100, 179),
#                                     threads = c(1, 1, 1, 1, 8, 8, 8)))


## ----echo = F-----------------------------------------------------------------
# dtbl_bench = dtbl_bench[order(dtbl_bench$accuracy, decreasing = T), ]
# dtbl_bench


## ----echo = F-----------------------------------------------------------------
# dtbl_bench = dtbl_bench[order(dtbl_bench$pred_perc, decreasing = T), ]
# dtbl_bench


## ----echo = F-----------------------------------------------------------------
# dtbl_bench = dtbl_bench[order(dtbl_bench$NAs, decreasing = F), ]
# dtbl_bench


## ----echo = F-----------------------------------------------------------------
# dtbl_bench = dtbl_bench[order(dtbl_bench$seconds, decreasing = F), ]
# dtbl_bench
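## ----echo = F-----------------------------------------------------------------
#
# # An optional sketch (an addition, not part of the original vignette) that
# # visualizes the accuracy / running-time trade-off of the 'dtbl_bench'
# # summary table; ggplot2 is already attached in the heat-map chunk
#
# ggplot(dtbl_bench, aes(x = seconds, y = accuracy, label = method)) +
#   geom_point(aes(size = pred_perc)) +      # point size reflects the percentage of predicted rows
#   geom_text(vjust = -1.0) +
#   theme_bw()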
## ----eval = F, echo = T--------------------------------------------------------
#
# dir_files = file.path(dir_wili_2018, 'declaration_human_rights')


## ----echo = T-----------------------------------------------------------------
#
# lst_files = list.files(dir_files, full.names = T, pattern = '\\.pdf$')
#
# decl_dat = lapply(1:length(lst_files), function(x) {
#
#   iter_dat = pdftools::pdf_text(pdf = lst_files[x])
#   lang = trimws(unlist(strsplit(gsub('\\.pdf$', '', basename(lst_files[x])), '_')), which = 'both')
#   lang = lang[length(lang)]            # the language name is the last '_'-separated token of the file name
#   vec_txt = as.vector(unlist(trimws(iter_dat, which = 'both')))
#   vec_txt = as.vector(sapply(vec_txt, function(x) gsub('\n', '', x)))
#
#   idx_lang = which(isocodes_fasttext$Name_tolower == lang)
#   isocode_3_language = rep(isocodes_fasttext$Alpha_3_B[idx_lang], length(vec_txt))
#   isocode_2_language = rep(isocodes_fasttext$Alpha_2[idx_lang], length(vec_txt))
#   language = rep(lang, length(vec_txt))
#
#   dtbl = data.table::setDT(list(isocode_3_language = isocode_3_language,
#                                 isocode_2_language = isocode_2_language,
#                                 language = language,
#                                 text = vec_txt))
#   dtbl
# })
#
# decl_dat = data.table::rbindlist(decl_dat)


## ----echo = F-----------------------------------------------------------------
#
# decl_dat$language
# decl_dat$isocode_3_language
# decl_dat$isocode_2_language


## ----echo = T-----------------------------------------------------------------
#
# dtbl_res_in = fastText::language_identification(input_obj = decl_dat$text,
#                                                 pre_trained_language_model_path = file_ftz,
#                                                 k = 1,
#                                                 th = 0.0,
#                                                 threads = 1,
#                                                 verbose = TRUE)


## ----echo = F-----------------------------------------------------------------
#
# dtbl_res_in


## ----echo = T-----------------------------------------------------------------
#
# print_accuracy(size_input_data = length(dtbl_res_in$iso_lang_1),
#                true_data = decl_dat$isocode_2_language,
#                preds_data = dtbl_res_in$iso_lang_1,
#                method = 'fastText (.ftz pre-trained model)')


## ----echo = T-----------------------------------------------------------------
#
# cld2_vec = cld2::detect_language(text = decl_dat$text,
#                                  plain_text = TRUE,
#                                  lang_code = TRUE)
# cld2_vec


## ----echo = T-----------------------------------------------------------------
#
# print_accuracy(size_input_data = nrow(decl_dat),
#                true_data = decl_dat$isocode_2_language,
#                preds_data = cld2_vec,
#                method = 'cld2')


## ----echo = T-----------------------------------------------------------------
#
# cld3_vec = cld3::detect_language(text = decl_dat$text)
# cld3_vec


## ----echo = T-----------------------------------------------------------------
#
# print_accuracy(size_input_data = nrow(decl_dat),
#                true_data = decl_dat$isocode_2_language,
#                preds_data = cld3_vec,
#                method = 'cld3')


## ----echo = T-----------------------------------------------------------------
#
# textc = textcat(x = decl_dat$text, p = textcat::TC_byte_profiles, method = "CT")
# textc


## ----echo = T-----------------------------------------------------------------
#
# textc = as.vector(unlist(lapply(strsplit(textc, '-'), function(x) x[1])))
# textc = trimws(textc, which = 'both')
# textc


## ----echo = T-----------------------------------------------------------------
#
# print_accuracy(size_input_data = nrow(decl_dat),
#                true_data = decl_dat$language,
#                preds_data = textc,
#                method = 'textcat')
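## ----echo = T-----------------------------------------------------------------
#
# # An optional addition (not part of the original vignette): 'textcat' returns
# # full language names, so mapping them to the 2-letter isocodes through the
# # 'isocodes_fasttext' lookup table makes its output directly comparable with
# # the predictions of the remaining methods
#
# textc_iso_2 = isocodes_fasttext$Alpha_2[match(textc, isocodes_fasttext$Name_tolower)]      # NA where a profile name has no isocode match
# textc_iso_2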
## ----echo = T-----------------------------------------------------------------
#
# franc_vec = as.vector(sapply(decl_dat$text, function(x) {
#   franc(text = x, min_length = 10, max_length = 2048)
# }))
#
# franc_vec


## ----echo = T-----------------------------------------------------------------
#
# print_accuracy(size_input_data = nrow(decl_dat),
#                true_data = decl_dat$isocode_3_language,
#                preds_data = franc_vec,
#                method = 'franc')


## ----echo = T-----------------------------------------------------------------
#
# dtbl_out = decl_dat[, 1:3]
# colnames(dtbl_out) = c('true_y_iso_3', 'true_y_iso_2', 'true_y_language')
#
# dtbl_out
#
# dtbl_preds = data.table::setDT(list(fastText = dtbl_res_in$iso_lang_1,
#                                     cld2 = cld2_vec,
#                                     cld3 = cld3_vec,
#                                     textcat = textc,
#                                     franc = franc_vec))
#
# dtbl_preds
#
# dtbl_out = cbind(dtbl_out, dtbl_preds)
# dtbl_out


## ----echo = T-----------------------------------------------------------------
#
# lst_files = list.files(dir_files, full.names = F, pattern = '\\.pdf$')
#
# min_letters_en_es = 3          # min. number of characters for the 'en' and 'es' languages
# sample_words = 100             # sample that many words from each tokenized file
#
# decl_dat = lapply(1:length(lst_files), function(x) {
#
#   iter_dat = pdftools::pdf_text(pdf = file.path(dir_files, lst_files[x]))
#
#   dat_txt = sapply(iter_dat, function(y) {
#
#     if (lst_files[x] == 'declaration_human_rights_chinese.pdf') {
#       res_spl_lang = stringi::stri_split_boundaries(str = y,
#                                                     type = 'word',
#                                                     skip_word_none = TRUE,
#                                                     skip_word_letter = TRUE,
#                                                     skip_word_number = TRUE)
#     }
#     else {
#       res_spl_lang = stringi::stri_split(str = y,
#                                          regex = '[ \n,]',
#                                          omit_empty = TRUE,
#                                          tokens_only = TRUE)
#     }
#
#     res_spl_lang = trimws(res_spl_lang[[1]], which = 'both')
#     idx_empty = which(res_spl_lang == "")
#     if (length(idx_empty) > 0) {
#       res_spl_lang = res_spl_lang[-idx_empty]
#     }
#     if (!is.null(min_letters_en_es) && lst_files[x] != 'declaration_human_rights_chinese.pdf') {
#       nchars = nchar(res_spl_lang)
#       idx_chars = which(nchars >= min_letters_en_es)
#       if (length(idx_chars) > 0) {
#         res_spl_lang = res_spl_lang[idx_chars]
#       }
#     }
#     res_spl_lang
#   })
#
#   dat_txt = as.vector(unlist(dat_txt))
#   set.seed(1)
#   sampled_words = sample(dat_txt, sample_words)
#   sampled_words
# })
#
# decl_dat = as.vector(unlist(decl_dat))
# decl_dat = decl_dat[sample(1:length(decl_dat), length(decl_dat))]      # shuffle the words so that the languages are mixed
# multilingual_sentence = paste(decl_dat, collapse = ' ')
# multilingual_sentence


## ----echo = T-----------------------------------------------------------------
#
# num_languages = 3


## ----echo = T-----------------------------------------------------------------
#
# dtbl_multiling = fastText::language_identification(input_obj = multilingual_sentence,
#                                                    pre_trained_language_model_path = file_ftz,
#                                                    k = num_languages,
#                                                    th = 0.0,
#                                                    threads = 1,
#                                                    verbose = FALSE)
# dtbl_multiling


## ----echo = T-----------------------------------------------------------------
#
# cld2::detect_language_mixed(text = multilingual_sentence, plain_text = TRUE)$classification


## ----echo = T-----------------------------------------------------------------
#
# cld3::detect_language_mixed(text = multilingual_sentence, size = num_languages)


## ----echo = T-----------------------------------------------------------------
#
# # we could use the 'whitelist' parameter but the purpose is to identify languages from unknown text
#
# franc::franc_all(text = multilingual_sentence, max_length = nchar(multilingual_sentence) + 1)[1:num_languages, ]
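## ----echo = T-----------------------------------------------------------------
#
# # A closing sketch (an addition, assuming that the 'k = 3' call returns the
# # columns 'iso_lang_1' to 'iso_lang_3'): map the 2-letter isocodes of the
# # top-k fastText predictions back to full language names using the
# # 'isocodes_fasttext' lookup table built earlier
#
# top_k_iso = c(dtbl_multiling$iso_lang_1, dtbl_multiling$iso_lang_2, dtbl_multiling$iso_lang_3)
# isocodes_fasttext$Name[match(top_k_iso, isocodes_fasttext$Alpha_2)]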