## ----setup, include=FALSE-----------------------------------------------------
# CRAN will not have spaCy installed, so create static vignette
knitr::opts_chunk$set(eval = FALSE)

## ----echo = FALSE, message=FALSE----------------------------------------------
# library(magrittr)
# library(dplyr)
# library(ggplot2)
# library(cleanNLP)
# library(jsonlite)
# library(stringi)
# library(xml2)

## -----------------------------------------------------------------------------
# # fetch a page through the MediaWiki API and return its cleaned paragraphs
# grab_wiki <- function(lang, page) {
#   url <- sprintf(
#     "https://%s.wikipedia.org/w/api.php?action=parse&format=json&page=%s",
#     lang,
#     page)
#   page_json <- jsonlite::fromJSON(url)$parse$text$"*"
#   page_xml <- xml2::read_xml(page_json)
#   page_text <- xml_text(xml_find_all(page_xml, "//div/p"))
#
#   # remove footnote markers such as [12], then normalise the whitespace
#   page_text <- stri_replace_all(page_text, "", regex="\\[[0-9]+\\]")
#   page_text <- stri_replace_all(page_text, " ", regex="\n")
#   page_text <- stri_replace_all(page_text, " ", regex="[ ]+")
#   page_text <- page_text[stri_length(page_text) > 10]
#
#   return(page_text)
# }
#
# penguin <- grab_wiki("en", "penguin")
# penguin[1:10]  # just show the first 10 paragraphs

## -----------------------------------------------------------------------------
# cnlp_init_udpipe()  # the udpipe backend runs entirely in R, with no Python dependency
# anno <- cnlp_annotate(penguin, verbose=FALSE)
# anno$token

## -----------------------------------------------------------------------------
# # upper-case every proper noun (Penn Treebank tags NNP and NNPS); working
# # with token_with_ws keeps the trailing whitespace needed to rebuild the text
# token <- anno$token
# token$new_token <- token$token_with_ws
# change_these <- which(token$xpos %in% c("NNP", "NNPS"))
# token$new_token[change_these] <- stri_trans_toupper(token$new_token[change_these])

## -----------------------------------------------------------------------------
# # paste the tokens back into paragraphs and print the first ten, wrapped for display
# paragraphs <- tapply(token$new_token, token$doc_id, paste, collapse="")[1:10]
# paragraphs <- stri_wrap(paragraphs, simplify=FALSE, exdent = 1)
# cat(unlist(lapply(paragraphs, function(v) c(v, ""))), sep="\n")
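
## -----------------------------------------------------------------------------
# # A minimal alternative sketch (not part of the original vignette): the
# # token table also carries universal part-of-speech tags in the upos
# # column, where proper nouns are tagged "PROPN", so the language-specific
# # NNP/NNPS test above could be written without Penn Treebank codes:
# change_these <- which(token$upos == "PROPN")
# token$new_token[change_these] <- stri_trans_toupper(token$new_token[change_these])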
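
## -----------------------------------------------------------------------------
# # A small follow-up sketch (again not in the original vignette): since
# # dplyr is attached above, the annotation can be summarised directly, for
# # example ranking the most frequent proper-noun lemmas in the article:
# anno$token %>%
#   filter(upos == "PROPN") %>%
#   count(lemma, sort = TRUE)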
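
## -----------------------------------------------------------------------------
# # And since ggplot2 is attached, a quick sketch (an illustration, not from
# # the original vignette) of how many proper nouns each paragraph contains;
# # doc_id is one identifier per input paragraph here:
# anno$token %>%
#   group_by(doc_id) %>%
#   summarize(propn = sum(upos == "PROPN")) %>%
#   ggplot(aes(doc_id, propn)) +
#     geom_col()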