## ----setup, include = FALSE---------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  eval = FALSE,
  fig.width = 7,
  fig.height = 5
)

## ----installation-------------------------------------------------------------
# install.packages("SportMiner")

## ----install-dev--------------------------------------------------------------
# # install.packages("devtools")
# devtools::install_github("praveenchougale/SportMiner")

## ----api-setup----------------------------------------------------------------
# library(SportMiner)
# 
# # Option 1: Set directly in session
# sm_set_api_key("your_api_key_here")
# 
# # Option 2: Store in .Renviron (recommended)
# # usethis::edit_r_environ()
# # Add: SCOPUS_API_KEY=your_api_key_here
# # Restart R, then:
# sm_set_api_key()

## ----query-basics-------------------------------------------------------------
# # Search in title, abstract, and keywords
# query_basic <- 'TITLE-ABS-KEY("machine learning" AND "sports")'
# 
# # Search specific fields
# query_title <- 'TITLE("performance prediction")'
# query_abstract <- 'ABS("neural networks")'
# query_keywords <- 'KEY("injury prevention")'

## ----query-advanced-----------------------------------------------------------
# # Complex query with multiple conditions
# query <- paste0(
#   'TITLE-ABS-KEY(',
#   '("machine learning" OR "deep learning" OR "artificial intelligence") ',
#   'AND ("sports" OR "athlete*" OR "performance") ',
#   'AND NOT "e-sports"',
#   ') ',
#   'AND DOCTYPE(ar) ',                    # Articles only
#   'AND PUBYEAR > 2018 ',                 # Published after 2018
#   'AND LANGUAGE(english) ',              # English only
#   'AND SUBJAREA(MEDI OR HEAL OR COMP)'   # Relevant subject areas
# )

## ----search-execution---------------------------------------------------------
# papers <- sm_search_scopus(
#   query = query,
#   max_count = 200,
#   batch_size = 100,
#   view = "COMPLETE",
#   verbose = TRUE
# )
# 
# # Inspect results
# dim(papers)
# head(papers[, c("title", "year", "author_keywords")])

## ----preprocess---------------------------------------------------------------
# processed_data <- sm_preprocess_text(
#   data = papers,
#   text_col = "abstract",
#   doc_id_col = "doc_id",
#   min_word_length = 3
# )
# 
# head(processed_data)

## ----dtm----------------------------------------------------------------------
# dtm <- sm_create_dtm(
#   word_counts = processed_data,
#   min_term_freq = 3,
#   max_term_freq = 0.5
# )
# 
# # Matrix dimensions
# print(paste("Documents:", dtm$nrow, "| Terms:", dtm$ncol))
# 
# # Sparsity
# sparsity <- 100 * (1 - slam::row_sums(dtm > 0) / (dtm$nrow * dtm$ncol))
# print(paste("Sparsity:", round(sparsity, 2), "%"))

## ----optimal-k----------------------------------------------------------------
# k_selection <- sm_select_optimal_k(
#   dtm = dtm,
#   k_range = seq(4, 20, by = 2),
#   method = "gibbs",
#   plot = TRUE
# )
# 
# # View results
# print(k_selection$metrics)
# print(paste("Optimal k:", k_selection$optimal_k))

## ----train-lda----------------------------------------------------------------
# lda_model <- sm_train_lda(
#   dtm = dtm,
#   k = k_selection$optimal_k,
#   method = "gibbs",
#   iter = 2000,
#   alpha = 50 / k_selection$optimal_k,  # Symmetric Dirichlet prior
#   seed = 1729
# )
# 
# # Examine top terms per topic
# terms_matrix <- topicmodels::terms(lda_model, 10)
# print(terms_matrix)

## ----compare-models-----------------------------------------------------------
# comparison <- sm_compare_models(
#   dtm = dtm,
#   k = 10,
#   seed = 1729,
#   verbose = TRUE
# )
# 
# # View metrics
# print(comparison$metrics)
# print(paste("Recommended model:", comparison$recommendation))
# 
# # Extract best model
# best_model <- comparison$models[[tolower(comparison$recommendation)]]

## ----plot-terms, fig.cap="Top terms per topic with beta weights"--------------
# plot_terms <- sm_plot_topic_terms(
#   model = lda_model,
#   n_terms = 10
# )
# print(plot_terms)

## ----plot-frequency, fig.cap="Document distribution across topics"------------
# plot_freq <- sm_plot_topic_frequency(
#   model = lda_model,
#   dtm = dtm
# )
# print(plot_freq)

## ----plot-trends, fig.cap="Topic prevalence trends over time"-----------------
# # Ensure papers have doc_id matching DTM rownames
# papers$doc_id <- rownames(dtm)
# 
# plot_trends <- sm_plot_topic_trends(
#   model = lda_model,
#   dtm = dtm,
#   metadata = papers,
#   year_col = "year",
#   doc_id_col = "doc_id"
# )
# print(plot_trends)

## ----keyword-network, fig.cap="Author keyword co-occurrence network"----------
# network_plot <- sm_keyword_network(
#   data = papers,
#   keyword_col = "author_keywords",
#   min_cooccurrence = 3,
#   top_n = 30
# )
# print(network_plot)

## ----custom-preprocess--------------------------------------------------------
# processed_custom <- sm_preprocess_text(
#   data = papers,
#   text_col = "abstract",
#   doc_id_col = "doc_id",
#   min_word_length = 4,      # Longer minimum word length
#   custom_stopwords = c("study", "research", "paper")  # Additional stopwords
# )

## ----hyperparameters----------------------------------------------------------
# # Test different alpha values
# alphas <- c(0.1, 0.5, 1.0)
# results <- lapply(alphas, function(a) {
#   model <- sm_train_lda(dtm, k = 10, alpha = a, seed = 1729)
#   perplexity <- topicmodels::perplexity(model, dtm)
#   list(alpha = a, perplexity = perplexity)
# })
# 
# # Compare results
# do.call(rbind, results)

## ----export-------------------------------------------------------------------
# # Save model
# saveRDS(lda_model, "lda_model.rds")
# 
# # Save plots
# ggplot2::ggsave("topic_terms.png", plot_terms,
#                 width = 12, height = 8, dpi = 300)
# ggplot2::ggsave("topic_trends.png", plot_trends,
#                 width = 12, height = 6, dpi = 300)
# 
# # Export document-topic assignments
# topics <- topicmodels::topics(lda_model, 1)
# papers$dominant_topic <- paste0("Topic_", topics)
# write.csv(papers, "papers_with_topics.csv", row.names = FALSE)
# 
# # Export topic-term matrix
# beta <- topicmodels::posterior(lda_model)$terms
# write.csv(beta, "topic_term_matrix.csv")

## ----case-study---------------------------------------------------------------
# # Comprehensive search query
# query_case <- paste0(
#   'TITLE-ABS-KEY(',
#   '("sports analytics" OR "sports data science" OR "sports informatics" OR ',
#   '"performance analysis" OR "match analysis") ',
#   'AND ("data" OR "analytics" OR "statistics" OR "modeling")',
#   ') ',
#   'AND DOCTYPE(ar OR re) ',
#   'AND PUBYEAR > 2013 ',
#   'AND LANGUAGE(english)'
# )
# 
# # Retrieve papers
# papers_case <- sm_search_scopus(query_case, max_count = 500, verbose = TRUE)
# 
# # Full preprocessing pipeline
# processed_case <- sm_preprocess_text(papers_case, text_col = "abstract")
# dtm_case <- sm_create_dtm(processed_case, min_term_freq = 5, max_term_freq = 0.4)
# 
# # Model selection
# k_case <- sm_select_optimal_k(dtm_case, k_range = seq(6, 18, by = 2), plot = TRUE)
# 
# # Train final model
# model_case <- sm_train_lda(dtm_case, k = k_case$optimal_k,
#                            iter = 2000, seed = 1729)
# 
# # Visualizations
# terms_plot <- sm_plot_topic_terms(model_case, n_terms = 12)
# trends_plot <- sm_plot_topic_trends(model_case, dtm_case, papers_case)

## ----benchmarks---------------------------------------------------------------
# # Test on varying document sizes
# sizes <- c(100, 500, 1000, 2000)
# times <- sapply(sizes, function(n) {
#   subset_dtm <- dtm_case[1:min(n, dtm_case$nrow), ]
#   system.time({
#     sm_train_lda(subset_dtm, k = 10, iter = 1000)
#   })["elapsed"]
# })
# 
# # Display results
# data.frame(documents = sizes, time_seconds = times)

## ----reproducibility----------------------------------------------------------
# sm_train_lda(dtm, k = 10, seed = 1729)
# sm_compare_models(dtm, k = 10, seed = 1729)

## ----custom-viz---------------------------------------------------------------
# library(ggplot2)
# 
# # Customize theme parameters
# plot_terms + theme_sportminer(base_size = 14, grid = FALSE)

## ----session-info-------------------------------------------------------------
# sessionInfo()