## ----setup, include=FALSE, cache=FALSE------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ options(width = 1000) knitr::opts_chunk$set(echo = TRUE, message = FALSE, comment = NA, eval = TRUE) ## ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- file_conllu <- system.file(package = "udpipe", "dummydata", "traindata.conllu") file_conllu cat(head(readLines(file_conllu), 3), sep="\n") ## ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- library(udpipe) m <- udpipe_train(file = "toymodel.udpipe", files_conllu_training = file_conllu, annotation_tokenizer = list(dimension = 16, epochs = 1, batch_size = 100, dropout = 0.7), annotation_tagger = list(iterations = 1, models = 1, provide_xpostag = 1, provide_lemma = 0, provide_feats = 0), annotation_parser = "none") m$file_model ## The model is now trained and saved in file toymodel.udpipe in the current working directory ## Now we can use the model to annotate some text mymodel <- udpipe_load_model("toymodel.udpipe") x <- udpipe_annotate( object = mymodel, x = "Dit is een tokenizer met POS tagging, zonder lemmatisation noch laat deze dependency parsing toe.", parser = "none") str(as.data.frame(x)) ## ---- eval=FALSE---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- # m <- udpipe_train(file = "toymodel.udpipe", files_conllu_training = file_conllu, # annotation_tokenizer = "default", # annotation_tagger = "default", # annotation_parser = "default") ## ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- params <- list() ## Tokenizer training parameters params$tokenizer <- list(dimension = 24, epochs = 1, #epochs = 100, initialization_range = 0.1, batch_size = 100, learning_rate = 0.005, dropout = 0.1, early_stopping = 1) ## Tagger training parameters params$tagger <- list(models = 2, templates_1 = "tagger", guesser_suffix_rules_1 = 8, guesser_enrich_dictionary_1 = 6, guesser_prefixes_max_1 = 0, use_lemma_1 = 0, use_xpostag_1 = 1, use_feats_1 = 1, provide_lemma_1 = 0, provide_xpostag_1 = 1, provide_feats_1 = 1, prune_features_1 = 0, templates_2 = "lemmatizer", guesser_suffix_rules_2 = 6, guesser_enrich_dictionary_2 = 4, guesser_prefixes_max_2 = 4, use_lemma_2 = 1, use_xpostag_2 = 0, use_feats_2 = 0, provide_lemma_2 = 1, provide_xpostag_2 = 0, provide_feats_2 = 0, prune_features_2 = 0) ## Dependency parser training parameters params$parser <- list(iterations = 1, #iterations = 30, embedding_upostag = 20, embedding_feats = 20, embedding_xpostag = 0, embedding_form = 50, #embedding_form_file = "../ud-2.0-embeddings/nl.skip.forms.50.vectors", embedding_lemma = 0, embedding_deprel = 20, learning_rate = 0.01, learning_rate_final = 0.001, l2 = 0.5, hidden_layer = 200, batch_size = 10, transition_system = "projective", transition_oracle = "dynamic", structured_interval = 10) ## Train the model m <- udpipe_train(file = "toymodel.udpipe", files_conllu_training = file_conllu, annotation_tokenizer = params$tokenizer, annotation_tagger = params$tagger, annotation_parser = params$parser) ## ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- data(udpipe_annotation_params) str(udpipe_annotation_params$tokenizer) ## Example for training the tokenizer on the Dutch treebank hyperparams_nl <- subset(udpipe_annotation_params$tokenizer, language_treebank == "nl") as.list(hyperparams_nl) ## ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- ## Example for training the tagger on the Dutch treebank hyperparams_nl <- subset(udpipe_annotation_params$tagger, language_treebank == "nl") as.list(hyperparams_nl) ## ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- ## Example for training the dependency parser on the Dutch treebank hyperparams_nl <- subset(udpipe_annotation_params$parser, language_treebank == "nl") as.list(hyperparams_nl) ## ---- eval=FALSE---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- # library(utils) # library(udpipe) # library(word2vec) # # ## Work on data from Universal Dependencies - German GSD treebank # settings <- list() # settings$ud.train <- "https://raw.githubusercontent.com/UniversalDependencies/UD_German-GSD/r2.6/de_gsd-ud-train.conllu" # settings$ud.dev <- "https://raw.githubusercontent.com/UniversalDependencies/UD_German-GSD/r2.6/de_gsd-ud-dev.conllu" # settings$ud.test <- "https://raw.githubusercontent.com/UniversalDependencies/UD_German-GSD/r2.6/de_gsd-ud-test.conllu" # # ## Download the conllu files # download.file(url = settings$ud.train, destfile = "train.conllu") # download.file(url = settings$ud.dev, destfile = "dev.conllu") # download.file(url = settings$ud.test, destfile = "test.conllu") # # ## Create wordvectors as these are used for training the dependency parser + save the word vectors to disk # x <- udpipe_read_conllu("train.conllu") # x <- paste.data.frame(x, term = "token", group = c("doc_id", "paragraph_id", "sentence_id"), collapse = " ") # x <- x$token # writeLines(x, con = file("text.txt", encoding = "UTF-8", open = "wt")) # w2v <- word2vec("text.txt", type = "skip-gram", dim = 50, window = 10, min_count = 2, negative = 5, iter = 15, threads = 1) # write.word2vec(w2v, file = "wordvectors.vec", type = "txt", encoding = "UTF-8") # predict(w2v, c("gut", "freundlich"), type = "nearest", top = 20) # # ## Train the model # print(Sys.time()) # m <- udpipe_train(file = "de_gsd-ud-2.6-20200924.udpipe", # files_conllu_training = "train.conllu", # files_conllu_holdout = "dev.conllu", # annotation_tokenizer = list(dimension = 64, epochs = 100, segment_size=200, initialization_range = 0.1, # batch_size = 50, learning_rate = 0.002, learning_rate_final=0, dropout = 0.1, early_stopping = 1), # annotation_tagger = list(models = 2, # templates_1 = "lemmatizer", guesser_suffix_rules_1 = 8, guesser_enrich_dictionary_1 = 4, guesser_prefixes_max_1 = 4, # use_lemma_1 = 1,provide_lemma_1 = 1, use_xpostag_1 = 0, provide_xpostag_1 = 0, # use_feats_1 = 0, provide_feats_1 = 0, prune_features_1 = 1, # templates_2 = "tagger", guesser_suffix_rules_2 = 8, guesser_enrich_dictionary_2 = 4, guesser_prefixes_max_2 = 0, # use_lemma_2 = 1, provide_lemma_2 = 0, use_xpostag_2 = 1, provide_xpostag_2 = 1, # use_feats_2 = 1, provide_feats_2 = 1, prune_features_2 = 1), # annotation_parser = list(iterations = 30, embedding_upostag = 20, embedding_feats = 20, embedding_xpostag = 0, # embedding_form = 50, embedding_form_file = "wordvectors.vec", # embedding_lemma = 0, embedding_deprel = 20, learning_rate = 0.01, # learning_rate_final = 0.001, l2 = 0.5, hidden_layer = 200, # batch_size = 10, transition_system = "projective", transition_oracle = "dynamic", # structured_interval = 8)) # print(Sys.time()) # # ## Evaluate the accuracy # m <- udpipe_load_model("de_gsd-ud-2.6-20200924.udpipe") # goodness_of_fit <- udpipe_accuracy(m, "test.conllu", tokenizer = "default", tagger = "default", parser = "default") # cat(goodness_of_fit$accuracy, sep = "\n") # goodness_of_fit <- udpipe_accuracy(m, "test.conllu", tokenizer = "none", tagger = "default", parser = "default") # cat(goodness_of_fit$accuracy, sep = "\n") # goodness_of_fit <- udpipe_accuracy(m, "test.conllu", tokenizer = "none", tagger = "none", parser = "default") # cat(goodness_of_fit$accuracy, sep = "\n") ## ---- results='hide', echo=FALSE------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ invisible(file.remove(c("toymodel.udpipe")))