## ----setup, include=FALSE, cache=FALSE----------------------------------------
## Use a very wide print width and set the default knitr chunk options that
## apply until they are reset further down.
options(width = 1000)
knitr::opts_chunk$set(echo = TRUE, message = FALSE, comment = NA, eval = TRUE)
## -----------------------------------------------------------------------------
## Attach udpipe and download the Dutch model. The result `dl` is inspected
## here and its fields (download_failed, file_model) are used further down.
library(udpipe)
dl <- udpipe_download_model(language = "dutch")
str(dl)
## ---- echo=FALSE--------------------------------------------------------------
## Reset the default chunk options so that chunks are only evaluated when
## `dl$download_failed` is FALSE.
knitr::opts_chunk$set(
  echo = TRUE,
  message = FALSE,
  comment = NA,
  eval = !dl$download_failed
)
## -----------------------------------------------------------------------------
## Two equivalent ways of loading the model; the second assignment replaces
## the object created by the first.

## Either give a file in the current working directory
udmodel_dutch <- udpipe_load_model(file = "dutch-alpino-ud-2.5-191206.udpipe")

## Or give the full path to the file
udmodel_dutch <- udpipe_load_model(file = dl$file_model)
## -----------------------------------------------------------------------------
## Annotate two short Dutch documents with the full pipeline, convert the
## result to a data.frame and tabulate the universal POS tags.
txt <- c(
  "Ik ben de weg kwijt, kunt u me zeggen waar de Lange Wapper ligt? Jazeker meneer",
  "Het gaat vooruit, het gaat verbazend goed vooruit"
)
x <- udpipe_annotate(udmodel_dutch, x = txt)
x <- as.data.frame(x)
str(x)
table(x$upos)

## ---- results='hide'----------------------------------------------------------
## The same annotation with parts of the pipeline switched off through the
## `tagger` and `parser` arguments.

## Tokenization + finds sentences, does not execute POS tagging, nor
## lemmatization or dependency parsing
x <- udpipe_annotate(udmodel_dutch, x = txt, tagger = "none", parser = "none")
x <- as.data.frame(x)
table(x$upos)
table(x$dep_rel)

## Tokenization + finds sentences, does POS tagging and lemmatization but does
## not execute dependency parsing
x <- udpipe_annotate(udmodel_dutch, x = txt, tagger = "default", parser = "none")
x <- as.data.frame(x)
table(x$upos)
table(x$dep_rel)

## Tokenization + finds sentences and executes dependency parsing but does not
## do POS tagging nor lemmatization
x <- udpipe_annotate(udmodel_dutch, x = txt, tagger = "none", parser = "default")
x <- as.data.frame(x)
table(x$upos)
table(x$dep_rel)
## ---- results='hide'----------------------------------------------------------
## Annotate text that has already been split into tokens, using the
## "vertical" and "horizontal" tokenizers.

## Either put every token on a new line and use tokenizer: vertical
input <- list(
  doc1 = c("Ik", "ben", "de", "weg", "kwijt", ",", "kunt", "u", "me", "zeggen",
           "waar", "de", "Lange Wapper", "ligt", "?", "Jazeker", "meneer"),
  doc2 = c("Het", "gaat", "vooruit", ",", "het", "gaat", "verbazend", "goed",
           "vooruit")
)
## vapply() instead of sapply(): the result is always a character vector with
## one collapsed string per document (names doc1, doc2 are kept).
txt <- vapply(
  input,
  FUN = function(x) paste(x, collapse = "\n"),
  FUN.VALUE = character(1)
)
x <- udpipe_annotate(udmodel_dutch, x = txt, tokenizer = "vertical")
x <- as.data.frame(x)

## Or put every token of each document in 1 string separated by a space and
## use tokenizer: horizontal
## Note that if a token contains a space, you need to replace the space
## with the 'NO-BREAK SPACE' (U+00A0) character to make sure it is still
## considered as one token
txt <- vapply(
  input,
  FUN = function(x) {
    x <- gsub(" ", intToUtf8(160), x) ## replace space with no-break-space
    paste(x, collapse = " ")
  },
  FUN.VALUE = character(1)
)
x <- udpipe_annotate(udmodel_dutch, x = as.character(txt), tokenizer = "horizontal")
x <- as.data.frame(x)
## ---- eval=FALSE--------------------------------------------------------------
## Not evaluated: example of annotating Sanskrit text with a model taken from
## another model repository and checking the encoding of the CoNLL-U output.
# dl <- udpipe_download_model(language = "sanskrit", udpipe_model_repo = "jwijffels/udpipe.models.ud.2.0")
# udmodel_sanskrit <- udpipe_load_model(file = dl$file_model)
# txt <- "ततः असौ प्राह क्षत्रियस्य तिस्रः भार्या धर्मम् भवन्ति तत् एषा कदाचिद् वैश्या सुता भविष्यति तत् अनुरागः ममास्याम् ततः रथकारः तस्य निश्चयम् विज्ञायावदत् वयस्य किम् अ धुना कर्तव्यम् कौलिकः आह किम् अहम् जानामि त्वयि मित्रे यत् अभिहितं मया ततः"
# x <- udpipe_annotate(udmodel_sanskrit, x = txt)
# Encoding(x$conllu)
# x <- as.data.frame(x)
## ---- eval=FALSE--------------------------------------------------------------
## Not evaluated: example of writing the CoNLL-U output of an annotation to a
## file.
# x <- udpipe_annotate(udmodel_sanskrit, x = txt)
# cat(x$conllu, file = "myannotation.conllu")
## ---- results='hide', echo=FALSE----------------------------------------------
## Clean up: delete the downloaded model file if it exists. invisible() keeps
## the logical result of file.remove() from being printed.
if (file.exists(dl$file_model)) {
  invisible(file.remove(dl$file_model))
}