## ----setup, include = FALSE---------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  eval = FALSE,
  fig.width = 7,
  fig.height = 5
)

## ----installation-------------------------------------------------------------
# install.packages("SportMiner")

## ----install-dev--------------------------------------------------------------
# # install.packages("devtools")
# devtools::install_github("praveenchougale/SportMiner")

## ----api-setup-----------------------------------------------------------------
# library(SportMiner)
#
# # Option 1: Set directly in session
# sm_set_api_key("your_api_key_here")
#
# # Option 2: Store in .Renviron (recommended)
# # usethis::edit_r_environ()
# # Add: SCOPUS_API_KEY=your_api_key_here
# # Restart R, then:
# sm_set_api_key()

## ----query-basics---------------------------------------------------------------
# # Search in title, abstract, and keywords
# query_basic <- 'TITLE-ABS-KEY("machine learning" AND "sports")'
#
# # Search specific fields
# query_title <- 'TITLE("performance prediction")'
# query_abstract <- 'ABS("neural networks")'
# query_keywords <- 'KEY("injury prevention")'

## ----query-advanced--------------------------------------------------------------
# # Complex query with multiple conditions
# query <- paste0(
#   'TITLE-ABS-KEY(',
#   '("machine learning" OR "deep learning" OR "artificial intelligence") ',
#   'AND ("sports" OR "athlete*" OR "performance") ',
#   'AND NOT "e-sports"',
#   ') ',
#   'AND DOCTYPE(ar) ',                   # Articles only
#   'AND PUBYEAR > 2018 ',                # Published after 2018
#   'AND LANGUAGE(english) ',             # English only
#   'AND SUBJAREA(MEDI OR HEAL OR COMP)'  # Relevant subject areas
# )

## ----search-execution------------------------------------------------------------
# papers <- sm_search_scopus(
#   query = query,
#   max_count = 200,
#   batch_size = 100,
#   view = "COMPLETE",
#   verbose = TRUE
# )
#
# # Inspect results
# dim(papers)
# head(papers[, c("title", "year", "author_keywords")])

## ----preprocess-------------------------------------------------------------------
# processed_data <- sm_preprocess_text(
#   data = papers,
#   text_col = "abstract",
#   doc_id_col = "doc_id",
#   min_word_length = 3
# )
#
# head(processed_data)

## ----dtm--------------------------------------------------------------------------
# dtm <- sm_create_dtm(
#   word_counts = processed_data,
#   min_term_freq = 3,
#   max_term_freq = 0.5
# )
#
# # Matrix dimensions
# print(paste("Documents:", dtm$nrow, "| Terms:", dtm$ncol))
#
# # Sparsity: share of zero cells (dtm$v stores only the non-zero entries)
# sparsity <- 100 * (1 - length(dtm$v) / (dtm$nrow * dtm$ncol))
# print(paste("Sparsity:", round(sparsity, 2), "%"))

## ----optimal-k--------------------------------------------------------------------
# k_selection <- sm_select_optimal_k(
#   dtm = dtm,
#   k_range = seq(4, 20, by = 2),
#   method = "gibbs",
#   plot = TRUE
# )
#
# # View results
# print(k_selection$metrics)
# print(paste("Optimal k:", k_selection$optimal_k))

## ----train-lda--------------------------------------------------------------------
# lda_model <- sm_train_lda(
#   dtm = dtm,
#   k = k_selection$optimal_k,
#   method = "gibbs",
#   iter = 2000,
#   alpha = 50 / k_selection$optimal_k,  # Symmetric Dirichlet prior
#   seed = 1729
# )
#
# # Examine top terms per topic
# terms_matrix <- topicmodels::terms(lda_model, 10)
# print(terms_matrix)

## ----compare-models-----------------------------------------------------------------
# comparison <- sm_compare_models(
#   dtm = dtm,
#   k = 10,
#   seed = 1729,
#   verbose = TRUE
# )
#
# # View metrics
# print(comparison$metrics)
# print(paste("Recommended model:", comparison$recommendation))
#
# # Extract best model
# best_model <- comparison$models[[tolower(comparison$recommendation)]]
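## ----topic-labels-----------------------------------------------------------------
# # A minimal follow-up sketch, not part of the SportMiner API: label each topic
# # by its three highest-probability terms via the standard topicmodels posterior.
# # The chunk name "topic-labels" and the objects below are illustrative
# # additions, assuming lda_model from the train-lda chunk above.
# post <- topicmodels::posterior(lda_model)
# top3 <- apply(post$terms, 1, function(p) {
#   paste(names(sort(p, decreasing = TRUE))[1:3], collapse = "/")
# })
# print(paste0("Topic ", seq_along(top3), ": ", top3))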
print(paste("Recommended model:", comparison$recommendation)) # # # Extract best model # best_model <- comparison$models[[tolower(comparison$recommendation)]] ## ----plot-terms, fig.cap="Top terms per topic with beta weights"-------------- # plot_terms <- sm_plot_topic_terms( # model = lda_model, # n_terms = 10 # ) # print(plot_terms) ## ----plot-frequency, fig.cap="Document distribution across topics"------------ # plot_freq <- sm_plot_topic_frequency( # model = lda_model, # dtm = dtm # ) # print(plot_freq) ## ----plot-trends, fig.cap="Topic prevalence trends over time"----------------- # # Ensure papers have doc_id matching DTM rownames # papers$doc_id <- rownames(dtm) # # plot_trends <- sm_plot_topic_trends( # model = lda_model, # dtm = dtm, # metadata = papers, # year_col = "year", # doc_id_col = "doc_id" # ) # print(plot_trends) ## ----keyword-network, fig.cap="Author keyword co-occurrence network"---------- # network_plot <- sm_keyword_network( # data = papers, # keyword_col = "author_keywords", # min_cooccurrence = 3, # top_n = 30 # ) # print(network_plot) ## ----custom-preprocess-------------------------------------------------------- # processed_custom <- sm_preprocess_text( # data = papers, # text_col = "abstract", # doc_id_col = "doc_id", # min_word_length = 4, # Longer minimum word length # custom_stopwords = c("study", "research", "paper") # Additional stopwords # ) ## ----hyperparameters---------------------------------------------------------- # # Test different alpha values # alphas <- c(0.1, 0.5, 1.0) # results <- lapply(alphas, function(a) { # model <- sm_train_lda(dtm, k = 10, alpha = a, seed = 1729) # perplexity <- topicmodels::perplexity(model, dtm) # list(alpha = a, perplexity = perplexity) # }) # # # Compare results # do.call(rbind, results) ## ----export------------------------------------------------------------------- # # Save model # saveRDS(lda_model, "lda_model.rds") # # # Save plots # ggplot2::ggsave("topic_terms.png", plot_terms, # width = 12, height = 8, dpi = 300) # ggplot2::ggsave("topic_trends.png", plot_trends, # width = 12, height = 6, dpi = 300) # # # Export document-topic assignments # topics <- topicmodels::topics(lda_model, 1) # papers$dominant_topic <- paste0("Topic_", topics) # write.csv(papers, "papers_with_topics.csv", row.names = FALSE) # # # Export topic-term matrix # beta <- topicmodels::posterior(lda_model)$terms # write.csv(beta, "topic_term_matrix.csv") ## ----case-study--------------------------------------------------------------- # # Comprehensive search query # query_case <- paste0( # 'TITLE-ABS-KEY(', # '("sports analytics" OR "sports data science" OR "sports informatics" OR ', # '"performance analysis" OR "match analysis") ', # 'AND ("data" OR "analytics" OR "statistics" OR "modeling")', # ') ', # 'AND DOCTYPE(ar OR re) ', # 'AND PUBYEAR > 2013 ', # 'AND LANGUAGE(english)' # ) # # # Retrieve papers # papers_case <- sm_search_scopus(query_case, max_count = 500, verbose = TRUE) # # # Full preprocessing pipeline # processed_case <- sm_preprocess_text(papers_case, text_col = "abstract") # dtm_case <- sm_create_dtm(processed_case, min_term_freq = 5, max_term_freq = 0.4) # # # Model selection # k_case <- sm_select_optimal_k(dtm_case, k_range = seq(6, 18, by = 2), plot = TRUE) # # # Train final model # model_case <- sm_train_lda(dtm_case, k = k_case$optimal_k, # iter = 2000, seed = 1729) # # # Visualizations # terms_plot <- sm_plot_topic_terms(model_case, n_terms = 12) # trends_plot <- sm_plot_topic_trends(model_case, dtm_case, papers_case) 
## ----benchmarks---------------------------------------------------------------------
# # Test training time on varying document counts
# sizes <- c(100, 500, 1000, 2000)
# times <- sapply(sizes, function(n) {
#   subset_dtm <- dtm_case[1:min(n, dtm_case$nrow), ]
#   system.time({
#     sm_train_lda(subset_dtm, k = 10, iter = 1000)
#   })["elapsed"]
# })
#
# # Display results (cap sizes at the number of documents actually available)
# data.frame(documents = pmin(sizes, dtm_case$nrow), time_seconds = times)

## ----reproducibility-----------------------------------------------------------------
# # Fixed seeds make the fitted models reproducible across runs
# sm_train_lda(dtm, k = 10, seed = 1729)
# sm_compare_models(dtm, k = 10, seed = 1729)

## ----custom-viz----------------------------------------------------------------------
# library(ggplot2)
#
# # Customize theme parameters
# plot_terms + theme_sportminer(base_size = 14, grid = FALSE)

## ----session-info--------------------------------------------------------------------
# sessionInfo()
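## ----appendix-reload-----------------------------------------------------------------
# # Appendix sketch, assuming the files written in the export chunk above exist:
# # reload the saved model and topic assignments in a fresh session.
# lda_model <- readRDS("lda_model.rds")
# papers <- read.csv("papers_with_topics.csv")
# table(papers$dominant_topic)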