## ----setup, include = FALSE--------------------------------------------------- # only evaluate code if "NOT_CRAN" NOT_CRAN <- identical(tolower(Sys.getenv("NOT_CRAN")), "true") knitr::opts_chunk$set( collapse = TRUE, comment = "#>" ) if(NOT_CRAN){ if(is.na(rtika::tika_jar())){ rtika::install_tika() } } ## ---- eval=NOT_CRAN----------------------------------------------------------- library('rtika') library('magrittr') # Code to get ALL the files in my_path: # my_path <- "~" # batch <- file.path(my_path, # list.files(path = my_path, # recursive = TRUE)) # pipe the batch into tika_text() # to get plain text # test files batch <- c( system.file("extdata", "jsonlite.pdf", package = "rtika"), system.file("extdata", "curl.pdf", package = "rtika"), system.file("extdata", "table.docx", package = "rtika"), system.file("extdata", "xml2.pdf", package = "rtika"), system.file("extdata", "R-FAQ.html", package = "rtika"), system.file("extdata", "calculator.jpg", package = "rtika"), system.file("extdata", "tika.apache.org.zip", package = "rtika") ) text <- batch %>% tika_text() # normal syntax also works: # text <- tika_text(batch) ## ---- eval=NOT_CRAN----------------------------------------------------------- # Find which files had an issue # Handle them if needed batch[which(is.na(text))] ## ---- eval=NOT_CRAN----------------------------------------------------------- length(text) search <- text[grep(pattern = ' is ', x = text)] length(search) ## ---- eval=NOT_CRAN----------------------------------------------------------- download_directory <- tempfile('rtika_') dir.create(download_directory) urls <- c('https://tika.apache.org/', 'https://cran.rstudio.com/web/packages/keras/keras.pdf') downloaded <- urls %>% tika_fetch(download_directory) # it will add the appropriate file extension to the downloads downloaded ## ---- eval=NOT_CRAN----------------------------------------------------------- # create a directory not already in use. my_directory <- tempfile('rtika_') dir.create(my_directory) # pipe the batch to tika_text() batch %>% tika_text(threads = 4, return = FALSE, output_dir = my_directory) # list all the file locations processed_files <- file.path( normalizePath(my_directory), list.files(path = my_directory, recursive = TRUE) ) ## ---- eval=NOT_CRAN----------------------------------------------------------- processed_files ## ---- eval=NOT_CRAN----------------------------------------------------------- library('xml2') # get XHTML text html <- batch %>% tika_html() %>% lapply(xml2::read_html) # parse links from documents links <- html %>% lapply(xml2::xml_find_all, '//a') %>% lapply(xml2::xml_attr, 'href') sample(links[[1]],10) ## ---- eval=NOT_CRAN----------------------------------------------------------- # Content-Type html %>% lapply(xml2::xml_find_first, '//meta[@name="Content-Type"]') %>% lapply(xml2::xml_attr, 'content') %>% unlist() # Creation-Date html %>% lapply(xml2::xml_find_first, '//meta[@name="Creation-Date"]') %>% lapply(xml2::xml_attr, 'content') %>% unlist() ## ---- eval=NOT_CRAN----------------------------------------------------------- library('jsonlite') # batch <- system.file("extdata", "calculator.jpg", package = "rtika") # a list of data.frames metadata <- batch %>% tika_json() %>% lapply(jsonlite::fromJSON) # look at metadata for an image str(metadata[[6]]) ## ---- eval=NOT_CRAN----------------------------------------------------------- metadata[[6]]$'geo:lat' metadata[[6]]$'geo:long' ## ---- eval=NOT_CRAN----------------------------------------------------------- # wget gets a webpage and other files. # sys::exec_wait('wget', c('--page-requisites', 'https://tika.apache.org/')) # Put it all into a .zip file # sys::exec_wait('zip', c('-r', 'tika.apache.org.zip' ,'tika.apache.org')) batch <- system.file("extdata", "tika.apache.org.zip", package = "rtika") # a list of data.frames metadata <- batch %>% tika_json() %>% lapply(jsonlite::fromJSON) # The structure is very long. See it on your own with: str(metadata) ## ---- eval=NOT_CRAN----------------------------------------------------------- # the 'X-TIKA:embedded_resource_path' field embedded_resource_path <- metadata %>% lapply(function(x){ x$'X-TIKA:embedded_resource_path' }) embedded_resource_path ## ---- eval=NOT_CRAN----------------------------------------------------------- content_type <- metadata %>% lapply(function(x){ x$'Content-Type' }) content_type ## ---- eval=NOT_CRAN----------------------------------------------------------- content <- metadata %>% lapply(function(x){ x$'X-TIKA:content' }) str(content)