## ----setup, echo=FALSE, warning=FALSE-----------------------------------------
# Vignette-wide knitr defaults. Each option is listed exactly once; the
# previous call repeated `warning` and `message` with identical values.
knitr::opts_chunk$set(
  echo = TRUE,
  eval = TRUE,
  warning = FALSE,
  message = FALSE,
  collapse = TRUE,
  comment = "#>",
  fig.width = 7,
  fig.height = 5
)

suppressPackageStartupMessages(library(rPDBapi))
suppressPackageStartupMessages(library(dplyr))

# Optional packages: only checked here, never attached.
have_r3dmol <- requireNamespace("r3dmol", quietly = TRUE)
have_shiny <- requireNamespace("shiny", quietly = TRUE)

# Default entry ID; later chunks may overwrite it with a live search hit.
selected_entry <- "4HHB"

# Evaluate `expr` in the caller's frame with messages suppressed.
quietly <- function(expr) suppressMessages(eval.parent(substitute(expr)))

## ----installation, eval = FALSE-----------------------------------------------
# install.packages("rPDBapi")
#
# # Development version
# remotes::install_github("selcukorkmaz/rPDBapi")

## ----libraries----------------------------------------------------------------
suppressPackageStartupMessages(library(rPDBapi))
suppressPackageStartupMessages(library(dplyr))

## ----concepts-----------------------------------------------------------------
# Three independent filters ...
kinase_full_text <- DefaultOperator("protein kinase")
high_resolution <- RangeOperator(
  attribute = "rcsb_entry_info.resolution_combined",
  from_value = 0,
  to_value = 2.5
)
xray_method <- ExactMatchOperator(
  attribute = "exptl.method",
  value = "X-RAY DIFFRACTION"
)

# ... combined into one query group with AND logic.
kinase_query <- QueryGroup(
  queries = list(kinase_full_text, xray_method, high_resolution),
  logical_operator = "AND"
)
kinase_query

## ----request-options----------------------------------------------------------
search_controls <- RequestOptions(
  result_start_index = 0,
  num_results = 10,
  sort_by = "score",
  desc = TRUE
)
search_controls

## ----identifier-helpers-------------------------------------------------------
example_ids <- c("4HHB", "4HHB-1", "4HHB_1", "4HHB.A", "ATP")
dplyr::tibble(
  id = example_ids,
  inferred_type = infer_id_type(example_ids)
)
parse_rcsb_id("4HHB-1")
build_entry_id(" 4HHB ")
build_assembly_id("4HHB", 1)
build_entity_id("4HHB", 1)
build_instance_id("4HHB", "A")

## ----simple-search, eval = TRUE-----------------------------------------------
kinase_hits <- query_search("protein kinase")
head(kinase_hits, 10)
class(kinase_hits)
attr(kinase_hits, "return_type")

## ----advanced-search, eval = TRUE---------------------------------------------
kinase_entry_ids <- perform_search(
  search_operator = kinase_query,
  return_type = "ENTRY",
  request_options = search_controls,
  verbosity = FALSE
)
kinase_entry_ids
class(kinase_entry_ids)

## ----entry-properties---------------------------------------------------------
# Property list reused by the entry-level fetch chunks below.
entry_properties <- list(
  rcsb_id = list(),
  struct = c("title"),
  struct_keywords = c("pdbx_keywords"),
  exptl = c("method"),
  rcsb_entry_info = c("molecular_weight", "resolution_combined"),
  rcsb_accession_info = c("initial_release_date")
)
entry_properties

## ----schema-aware-properties--------------------------------------------------
head(list_rcsb_fields("ENTRY"), 10)
search_rcsb_fields("resolution", data_type = "ENTRY")
validate_properties(
  properties = entry_properties,
  data_type = "ENTRY",
  strict = TRUE
)
validate_properties(
  properties = list(
    rcsb_entry_info = c("resolution_combined", "unknown_subfield")
  ),
  data_type = "ENTRY",
  strict = FALSE
)

## ----strict-validation-pattern, eval = TRUE-----------------------------------
# Enable strict validation only for this one call. The restore is done with
# `finally` rather than a top-level on.exit(): outside a function body
# on.exit() is not reliably deferred (depending on the evaluator it can fire
# right away or not at all), which would either disable strict mode before the
# call or leak the option into the rest of the session.
old_opt <- options(rPDBapi.strict_property_validation = TRUE)
tryCatch(
  generate_json_query(
    ids = c("4HHB"),
    data_type = "ENTRY",
    properties = list(rcsb_entry_info = c("resolution_combined"))
  ),
  finally = options(old_opt)
)

## ----entry-metadata, eval = TRUE----------------------------------------------
kinase_metadata <- data_fetcher(
  id = kinase_entry_ids[1:5],
  data_type = "ENTRY",
  properties = entry_properties,
  return_as_dataframe = TRUE
)
kinase_metadata

## ----raw-query, eval = TRUE---------------------------------------------------
kinase_json_query <- generate_json_query(
  ids = kinase_entry_ids[1:3],
  data_type = "ENTRY",
  properties = entry_properties
)
cat(kinase_json_query)

## ----raw-response, eval = TRUE------------------------------------------------
kinase_raw <- fetch_data(
  json_query = kinase_json_query,
  data_type = "ENTRY",
  ids = kinase_entry_ids[1:3]
)
str(kinase_raw, max.level = 2)

## ----tidy-conversion, eval = TRUE---------------------------------------------
kinase_tidy <- return_data_as_dataframe(
  response = kinase_raw,
  data_type = "ENTRY",
  ids = kinase_entry_ids[1:3]
)
kinase_tidy

## ----batch-fetch, eval = TRUE-------------------------------------------------
# Cache lives under the session temp directory.
cache_dir <- file.path(tempdir(), "rpdbapi-vignette-cache")
kinase_batch <- data_fetcher_batch(
  id = kinase_entry_ids[1:5],
  data_type = "ENTRY",
  properties = entry_properties,
  return_as_dataframe = TRUE,
  batch_size = 2,
  retry_attempts = 2,
  retry_backoff = 0,
  cache = TRUE,
  cache_dir = cache_dir,
  progress = FALSE,
  verbosity = FALSE
)
kinase_batch
attr(kinase_batch, "provenance")
cache_info(cache_dir)

## ----clear-cache, eval = TRUE-------------------------------------------------
clear_rpdbapi_cache(cache_dir)
cache_info(cache_dir)

## ----batch-strategy, eval = TRUE----------------------------------------------
# Use data_fetcher() when:
# - the ID set is small
# - you want the simplest request path
# - retry, cache, and provenance are unnecessary
# Use data_fetcher_batch() when:
# - the ID set is large
# - requests may need retries
# - repeated retrieval should reuse cached results
# - you want an explicit provenance record

## ----provenance-interpretation, eval = TRUE-----------------------------------
# One row per provenance field. List-valued fields are shown as "". Any other
# field is converted with as.character(); results that are not exactly length
# one (vectors, NULL) are collapsed with "; " so vapply() always receives a
# single string instead of failing its character(1) check.
provenance_tbl <- dplyr::tibble(
  field = names(attr(kinase_batch, "provenance")),
  value = vapply(
    attr(kinase_batch, "provenance"),
    function(x) {
      if (is.list(x)) {
        return("")
      }
      text <- as.character(x)
      if (length(text) == 1L) text else paste(text, collapse = "; ")
    },
    character(1)
  )
)
provenance_tbl

## ----assembly-search, eval = TRUE---------------------------------------------
kinase_assembly_ids <- perform_search(
  search_operator = kinase_query,
  return_type = "ASSEMBLY",
  request_options = RequestOptions(result_start_index = 0, num_results = 5),
  verbosity = FALSE
)
kinase_assembly_ids

## ----assembly-metadata, eval = TRUE-------------------------------------------
assembly_properties <- list(
  rcsb_id = list(),
  pdbx_struct_assembly = c("details", "method_details", "oligomeric_count"),
  rcsb_struct_symmetry = c("kind", "symbol")
)
kinase_assemblies <- data_fetcher(
  id = kinase_assembly_ids,
  data_type = "ASSEMBLY",
  properties = assembly_properties,
  return_as_dataframe = TRUE
)
kinase_assemblies

## ----assembly-objects, eval = TRUE--------------------------------------------
assembly_object <- as_rpdb_assembly(
  kinase_assemblies,
  metadata = list(query = "protein kinase assemblies")
)
assembly_object
dplyr::as_tibble(assembly_object)
summarize_assemblies(assembly_object)

## ----identifier-aware-retrieval-----------------------------------------------
dplyr::tibble(
  example_id = c("4HHB", "4HHB-1", "4HHB_1", "4HHB.A", "ATP"),
  inferred_type = infer_id_type(c("4HHB", "4HHB-1", "4HHB_1", "4HHB.A", "ATP"))
)
parse_rcsb_id("4HHB.A")

## ----identifier-aware-fetch, eval = TRUE--------------------------------------
# Entry-level retrieval
data_fetcher(
  id = build_entry_id("4HHB"),
  data_type = "ENTRY",
  properties = list(rcsb_id = list())
)

# Assembly-level retrieval
data_fetcher(
  id = build_assembly_id("4HHB", 1),
  data_type = "ASSEMBLY",
  properties = list(rcsb_id = list())
)

# Polymer-entity retrieval
data_fetcher(
  id = build_entity_id("4HHB", 1),
  data_type = "POLYMER_ENTITY",
  properties = list(rcsb_id = list())
)

## ----polymer-search, eval = TRUE----------------------------------------------
kinase_polymer_ids <- perform_search(
  search_operator = kinase_query,
  return_type = "POLYMER_ENTITY",
  request_options = RequestOptions(result_start_index = 0, num_results = 5),
  verbosity = FALSE
)
kinase_polymer_ids

## ----polymer-metadata, eval = TRUE--------------------------------------------
polymer_properties <- list(
  rcsb_id = list(),
  rcsb_entity_source_organism = c("ncbi_taxonomy_id", "ncbi_scientific_name"),
  rcsb_cluster_membership = c("cluster_id", "identity")
)
kinase_polymer_metadata <- data_fetcher(
  id = kinase_polymer_ids,
  data_type = "POLYMER_ENTITY",
  properties = polymer_properties,
  return_as_dataframe = TRUE
)
kinase_polymer_metadata

## ----taxonomy-extraction, eval = TRUE-----------------------------------------
polymer_object <- as_rpdb_polymer_entity(
  kinase_polymer_metadata,
  metadata = list(query = "kinase polymer entities")
)
taxonomy_table <- extract_taxonomy_table(polymer_object)
taxonomy_table
taxonomy_table %>%
  count(ncbi_scientific_name, sort = TRUE)

## ----entry-detail, eval = TRUE------------------------------------------------
# Use the first search hit when there is one. An empty result would make
# `[[1]]` fail with a subscript error, so fall back to the same default entry
# that the literature chunk below uses.
selected_entry <- if (length(kinase_entry_ids) > 0) {
  kinase_entry_ids[[1]]
} else {
  "4HHB"
}
selected_info <- quietly(get_info(selected_entry))

# Each field is pulled with a typed default so a missing node yields NA
# instead of an error.
entry_summary <- dplyr::tibble(
  rcsb_id = selected_entry,
  title = purrr::pluck(
    selected_info, "struct", "title",
    .default = NA_character_
  ),
  keywords = purrr::pluck(
    selected_info, "struct_keywords", "pdbx_keywords",
    .default = NA_character_
  ),
  method = purrr::pluck(
    selected_info, "exptl", 1, "method",
    .default = NA_character_
  ),
  citation_title = purrr::pluck(
    selected_info, "rcsb_primary_citation", "title",
    .default = NA_character_
  ),
  resolution = paste(
    purrr::pluck(
      selected_info, "rcsb_entry_info", "resolution_combined",
      .default = NA
    ),
    collapse = "; "
  )
)
entry_summary

## ----literature-links, eval = TRUE, warning=FALSE-----------------------------
# Guard against a missing, non-scalar, NA, or empty entry ID. nzchar(NA) is
# TRUE, so NA has to be tested explicitly; the length check keeps `||` scalar.
if (
  !exists("selected_entry", inherits = TRUE) ||
    length(selected_entry) != 1L ||
    is.na(selected_entry) ||
    !nzchar(selected_entry)
) {
  selected_entry <- "4HHB"
}
literature_term <- selected_entry
kinase_papers <- quietly(find_papers(literature_term, max_results = 3))
kinase_keywords <- quietly(
  find_results(literature_term, field = "struct_keywords")
)
kinase_papers
head(kinase_keywords, 3)

## ----coordinates, eval = TRUE-------------------------------------------------
kinase_structure <- get_pdb_file(
  pdb_id = selected_entry,
  filetype = "cif",
  verbosity = FALSE
)

# Reshape the flat xyz vector into one row of (x, y, z) per group of three.
coordinate_matrix <- matrix(kinase_structure$xyz, ncol = 3, byrow = TRUE)
coordinate_df <- data.frame(
  x = coordinate_matrix[, 1],
  y = coordinate_matrix[, 2],
  z = coordinate_matrix[, 3]
)

# Keep only the rows selected by the `calpha` index, with their coordinates.
calpha_atoms <- cbind(
  kinase_structure$atom[kinase_structure$calpha, c("chain", "resno", "resid")],
  coordinate_df[kinase_structure$calpha, , drop = FALSE]
)
head(calpha_atoms, 10)

## ----calpha-helper, eval = TRUE-----------------------------------------------
calpha_atoms <- extract_calpha_coordinates(kinase_structure)
head(calpha_atoms, 10)

## ----fasta, eval = TRUE-------------------------------------------------------
kinase_sequences <- get_fasta_from_rcsb_entry(selected_entry, verbosity = FALSE)
length(kinase_sequences)
utils::head(nchar(unlist(kinase_sequences)))

## ----structure-sequence-join, eval = TRUE-------------------------------------
chain_sequence_summary <- join_structure_sequence(
  kinase_structure,
  kinase_sequences
)
chain_sequence_summary

## ----object-model-local-------------------------------------------------------
entry_demo <- as_rpdb_entry(
  data.frame(
    rcsb_id = c("4HHB", "1CRN"),
    method = c("X-RAY DIFFRACTION", "SOLUTION NMR"),
    resolution_combined = c("1.74", NA),
    stringsAsFactors = FALSE
  ),
  metadata = list(example = "local object demo")
)
entry_demo
dplyr::as_tibble(entry_demo)
summarize_entries(entry_demo)
entry_demo$metadata

## ----structure-object-local---------------------------------------------------
structure_demo <- as_rpdb_structure(
  list(
    atom = data.frame(
      chain = c("A", "A"),
      resno = c(1L, 2L),
      resid = c("GLY", "ALA"),
      stringsAsFactors = FALSE
    ),
    xyz = c(1, 2, 3, 4, 5, 6),
    calpha = c(TRUE, FALSE)
  ),
  metadata = list(source = "illustration")
)
structure_demo
dplyr::as_tibble(structure_demo)

## ----downstream-analysis, eval = TRUE-----------------------------------------
entry_object <- as_rpdb_entry(
  kinase_metadata,
  metadata = list(query = "protein kinase entry metadata")
)
summarize_entries(entry_object)

# Convert the columns used below to numeric/Date, then sort by resolution.
kinase_summary <- dplyr::as_tibble(entry_object) %>%
  mutate(
    molecular_weight = as.numeric(molecular_weight),
    resolution_combined = as.numeric(resolution_combined),
    initial_release_date = as.Date(initial_release_date)
  ) %>%
  arrange(resolution_combined) %>%
  select(
    rcsb_id,
    title,
    pdbx_keywords,
    method,
    molecular_weight,
    resolution_combined,
    initial_release_date
  )
kinase_summary
kinase_summary %>%
  summarise(
    n_structures = n(),
    median_molecular_weight = median(molecular_weight, na.rm = TRUE),
    best_resolution = min(resolution_combined, na.rm = TRUE)
  )

## ----taxonomy-summary, eval = TRUE--------------------------------------------
kinase_polymer_metadata %>%
  count(ncbi_scientific_name, sort = TRUE)

## ----r3dmol-view, eval = have_r3dmol && have_shiny----------------------------
# r3dmol is reached through its namespace object, so it is never attached.
r3d <- asNamespace("r3dmol")
visualization_entry <- "4HHB"
saved_structure <- quietly(get_pdb_file(
  pdb_id = visualization_entry,
  filetype = "pdb",
  save = TRUE,
  path = tempdir(),
  verbosity = FALSE
))
r3d$r3dmol() %>%
  r3d$m_add_model(data = saved_structure$path, format = "pdb") %>%
  r3d$m_set_style(style = r3d$m_style_cartoon(color = "spectrum")) %>%
  r3d$m_zoom_to()

## ----sequence-operator--------------------------------------------------------
kinase_motif_sequence <- "VAIKTLKPGTMSPEAFLQEAQVMKKLRHEKLVQLYAVV"
sequence_operator <- SequenceOperator(
  sequence = kinase_motif_sequence,
  sequence_type = "PROTEIN",
  evalue_cutoff = 10,
  identity_cutoff = 0.7
)
sequence_operator
autoresolve_sequence_type("ATGCGTACGTAGC")
autoresolve_sequence_type("AUGCGUACGUAGC")

## ----sequence-search, eval = TRUE---------------------------------------------
sequence_hits <- perform_search(
  search_operator = sequence_operator,
  return_type = "POLYMER_ENTITY",
  request_options = RequestOptions(result_start_index = 0, num_results = 5),
  verbosity = FALSE
)
sequence_hits

## ----seqmotif-operator--------------------------------------------------------
prosite_like_motif <- SeqMotifOperator(
  pattern = "[LIV][ACDEFGHIKLMNPQRSTVWY]K[GST]",
  sequence_type = "PROTEIN",
  pattern_type = "REGEX"
)
prosite_like_motif

## ----seqmotif-search, eval = TRUE---------------------------------------------
motif_hits <- perform_search(
  search_operator = prosite_like_motif,
  return_type = "POLYMER_ENTITY",
  request_options = RequestOptions(result_start_index = 0, num_results = 5),
  verbosity = FALSE
)
motif_hits

## ----structure-operator-------------------------------------------------------
structure_operator <- StructureOperator(
  pdb_entry_id = "4HHB",
  assembly_id = 1,
  search_mode = "RELAXED_SHAPE_MATCH"
)
structure_operator
infer_search_service(structure_operator)

## ----structure-search, eval = TRUE--------------------------------------------
structure_hits <- perform_search(
  search_operator = QueryNode(structure_operator),
  return_type = "ASSEMBLY",
  request_options = RequestOptions(result_start_index = 0, num_results = 5),
  verbosity = FALSE
)
structure_hits

## ----chemical-operator--------------------------------------------------------
atp_like_operator <- ChemicalOperator(
  descriptor = "O=P(O)(O)OP(=O)(O)OP(=O)(O)O",
  matching_criterion = "fingerprint-similarity"
)
atp_like_operator
infer_search_service(atp_like_operator)

## ----chemical-search, eval = TRUE---------------------------------------------
chemical_hits <- perform_search(
  search_operator = QueryNode(atp_like_operator),
  return_type = "CHEMICAL_COMPONENT",
  request_options = RequestOptions(result_start_index = 0, num_results = 5),
  verbosity = FALSE
)
chemical_hits

## ----operator-reference-------------------------------------------------------
# NOTE(review): `exact_resolution` actually matches on "exptl.method", not on
# resolution. The name is kept because it is also the label printed in the
# list below -- confirm whether a rename (e.g. exact_method) is wanted.
exact_resolution <- ExactMatchOperator(
  attribute = "exptl.method",
  value = "X-RAY DIFFRACTION"
)
organism_inclusion <- InOperator(
  attribute = "rcsb_entity_source_organism.taxonomy_lineage.name",
  value = c("Homo sapiens", "Mus musculus")
)
title_words <- ContainsWordsOperator(
  attribute = "struct.title",
  value = "protein kinase"
)
title_phrase <- ContainsPhraseOperator(
  attribute = "struct.title",
  value = "protein kinase"
)
resolution_cutoff <- ComparisonOperator(
  attribute = "rcsb_entry_info.resolution_combined",
  value = 2.0,
  comparison_type = "LESS"
)
resolution_window <- RangeOperator(
  attribute = "rcsb_entry_info.resolution_combined",
  from_value = 1.0,
  to_value = 2.5
)
doi_exists <- ExistsOperator("rcsb_primary_citation.pdbx_database_id_doi")
list(
  exact_resolution = exact_resolution,
  organism_inclusion = organism_inclusion,
  title_words = title_words,
  title_phrase = title_phrase,
  resolution_cutoff = resolution_cutoff,
  resolution_window = resolution_window,
  doi_exists = doi_exists
)

## ----querynode-scoredresult---------------------------------------------------
operator_node <- QueryNode(title_words)
composite_query <- QueryGroup(
  queries = list(title_words, resolution_window, doi_exists),
  logical_operator = "AND"
)
scored_example <- ScoredResult(entity_id = "4HHB", score = 0.98)
operator_node
composite_query
scored_example

## ----scored-search-results, eval = TRUE---------------------------------------
scored_structure_hits <- perform_search(
  search_operator = QueryNode(structure_operator),
  return_type = "ASSEMBLY",
  request_options = RequestOptions(result_start_index = 0, num_results = 3),
  return_with_scores = TRUE,
  verbosity = FALSE
)
scored_structure_hits
class(scored_structure_hits)

## ----query-composition-strategy, eval = TRUE----------------------------------
# Pattern: build small reusable operators first
title_filter <- ContainsPhraseOperator("struct.title", "protein kinase")
resolution_filter <- ComparisonOperator(
  "rcsb_entry_info.resolution_combined",
  2.5,
  "LESS_OR_EQUAL"
)

# Combine them only when the biological question is clear
query_graph <- QueryGroup(
  queries = list(
    title_filter,
    resolution_filter
  ),
  logical_operator = "AND"
)

## ----query-search-variants, eval = TRUE---------------------------------------
# PubMed-linked structures
query_search(search_term = 27499440, query_type = "PubmedIdQuery")

# Organism/taxonomy search
organism_search <- query_search(
  search_term = "9606",
  query_type = "TreeEntityQuery"
)
head(organism_search)

# Experimental method search
experimental_search <- query_search(
  search_term = "X-RAY DIFFRACTION",
  query_type = "ExpTypeQuery"
)
head(experimental_search)

# Author search
query_search(search_term = "Kuriyan, J.", query_type = "AdvancedAuthorQuery")

# UniProt-linked entries
query_search(search_term = "P31749", query_type = "uniprot")

# PFAM-linked entries
pfam_search <- query_search(search_term = "PF00069", query_type = "pfam")
head(pfam_search)

## ----scan-params-example------------------------------------------------------
custom_scan_params <- list(
  request_options = list(
    paginate = list(start = 0, rows = 5),
    return_all_hits = FALSE
  )
)
custom_scan_params

## ----query-search-scan-params, eval = TRUE------------------------------------
limited_kinase_hits <- query_search(
  search_term = "protein kinase",
  scan_params = custom_scan_params
)
limited_kinase_hits

## ----add-property-------------------------------------------------------------
base_properties <- list(
  rcsb_entry_info = c("resolution_combined"),
  exptl = c("method")
)
extended_properties <- add_property(list(
  rcsb_entry_info = c("molecular_weight", "resolution_combined"),
  struct = c("title")
))
base_properties
extended_properties

## ----property-design-pattern--------------------------------------------------
property_workflow <- add_property(list(
  rcsb_id = list(),
  struct = c("title"),
  rcsb_entry_info = c("resolution_combined")
))
# NOTE(review): this second assignment replaces the value above rather than
# extending it, so only the second property list is validated -- confirm
# whether the two lists were meant to be merged.
property_workflow <- add_property(list(
  rcsb_entry_info = c("molecular_weight", "resolution_combined"),
  exptl = c("method")
))
property_workflow
validate_properties(property_workflow, data_type = "ENTRY", strict = FALSE)

## ----ligand-component-properties----------------------------------------------
ligand_properties <- list(
  rcsb_id = list(),
  chem_comp = c("id", "name", "formula", "formula_weight", "type"),
  rcsb_chem_comp_info = c("initial_release_date")
)
ligand_properties

## ----chemical-component-fetch, eval = TRUE------------------------------------
chemical_component_df <- data_fetcher(
  id = head(chemical_hits, 3),
  data_type = "CHEMICAL_COMPONENT",
  properties = ligand_properties,
  return_as_dataframe = TRUE
)
chemical_component_df

## ----ligand-object-helper, eval = TRUE----------------------------------------
ligand_object <- as_rpdb_chemical_component(
  chemical_component_df,
  metadata = list(query = "ATP-like chemical components")
)
extract_ligand_table(ligand_object)

## ----describe-chemical, eval = TRUE-------------------------------------------
atp_description <- quietly(describe_chemical("ATP"))
dplyr::tibble(
  chem_id = "ATP",
  name = purrr::pluck(
    atp_description, "chem_comp", "name",
    .default = NA_character_
  ),
  formula = purrr::pluck(
    atp_description, "chem_comp", "formula",
    .default = NA_character_
  ),
  formula_weight = purrr::pluck(
    atp_description, "chem_comp", "formula_weight",
    .default = NA
  ),
  smiles = purrr::pluck(
    atp_description, "rcsb_chem_comp_descriptor", "smiles",
    .default = NA_character_
  )
)

## ----instance-level-examples, eval = TRUE-------------------------------------
# Polymer chain instance
polymer_instance <- data_fetcher(
  id = "4HHB.A",
  data_type = "POLYMER_ENTITY_INSTANCE",
  properties = list(rcsb_id = list()),
  return_as_dataframe = TRUE,
  verbosity = FALSE
)

# Non-polymer instance (heme in hemoglobin entry 4HHB)
nonpolymer_instance <- data_fetcher(
  id = "4HHB.E",
  data_type = "NONPOLYMER_ENTITY_INSTANCE",
  properties = list(rcsb_id = list()),
  return_as_dataframe = TRUE,
  verbosity = FALSE
)
polymer_instance
nonpolymer_instance

## ----low-level-url------------------------------------------------------------
entry_url <- get_pdb_api_url("core/entry/", "4HHB")
chem_url <- get_pdb_api_url("core/chemcomp/", "ATP")
entry_url
chem_url

## ----low-level-lifecycle, eval = TRUE-----------------------------------------
# Manual request lifecycle
url <- get_pdb_api_url("core/entry/", "4HHB")
response <- send_api_request(url, verbosity = FALSE)
handle_api_errors(response, url)
payload <- parse_response(response, format = "json")

## ----low-level-http, eval = TRUE----------------------------------------------
entry_response <- send_api_request(entry_url, verbosity = FALSE)
handle_api_errors(entry_response, entry_url)
entry_payload <- parse_response(entry_response, format = "json")
# head() returns at most five names; `[1:5]` would pad a shorter payload
# with NA entries.
head(names(entry_payload), 5)

## ----graphql-low-level, eval = TRUE-------------------------------------------
mini_graphql <- generate_json_query(
  ids = kinase_entry_ids[1:2],
  data_type = "ENTRY",
  properties = list(rcsb_id = list(), struct = c("title"))
)
mini_graphql_response <- search_graphql(list(query = mini_graphql))
str(mini_graphql_response, max.level = 2)

## ----contracts-live, eval = TRUE----------------------------------------------
list(
  query_search_class = class(query_search("kinase")),
  perform_search_class = class(
    perform_search(DefaultOperator("kinase"), verbosity = FALSE)
  ),
  perform_search_scores_class = class(
    perform_search(
      DefaultOperator("kinase"),
      return_with_scores = TRUE,
      verbosity = FALSE
    )
  )
)

## ----fetch-contracts, eval = TRUE---------------------------------------------
raw_entry_response <- data_fetcher(
  id = kinase_entry_ids[1:2],
  data_type = "ENTRY",
  properties = list(rcsb_id = list()),
  return_as_dataframe = FALSE
)
tidy_entry_response <- data_fetcher(
  id = kinase_entry_ids[1:2],
  data_type = "ENTRY",
  properties = list(rcsb_id = list()),
  return_as_dataframe = TRUE
)
class(raw_entry_response)
class(tidy_entry_response)

## ----object-contracts, eval = TRUE--------------------------------------------
list(
  entry_object_class = class(as_rpdb_entry(kinase_metadata)),
  assembly_object_class = class(as_rpdb_assembly(kinase_assemblies)),
  polymer_object_class = class(as_rpdb_polymer_entity(kinase_polymer_metadata)),
  structure_object_class = class(as_rpdb_structure(kinase_structure)),
  batch_provenance_names = names(attr(kinase_batch, "provenance"))
)

## ----object-methods-local-----------------------------------------------------
local_entry_object <- as_rpdb_entry(
  data.frame(
    rcsb_id = "4HHB",
    method = "X-RAY DIFFRACTION",
    resolution_combined = "1.74",
    stringsAsFactors = FALSE
  ),
  metadata = list(source = "local method demo")
)
print(local_entry_object)
dplyr::as_tibble(local_entry_object)

## ----defensive-patterns-------------------------------------------------------
# Only conditions of class rPDBapi_error_invalid_input are caught; the handler
# returns the condition object so its class and message can be inspected.
invalid_property_result <- tryCatch(
  validate_properties(
    properties = list(unknown_field = c("x")),
    data_type = "ENTRY",
    strict = TRUE
  ),
  rPDBapi_error_invalid_input = function(e) e
)
invalid_fetch_result <- tryCatch(
  data_fetcher(
    id = character(0),
    data_type = "ENTRY",
    properties = list(rcsb_id = list())
  ),
  rPDBapi_error_invalid_input = function(e) e
)
list(
  invalid_property_class = class(invalid_property_result),
  invalid_property_message = conditionMessage(invalid_property_result),
  invalid_fetch_class = class(invalid_fetch_result),
  invalid_fetch_message = conditionMessage(invalid_fetch_result)
)

## ----export-reference, results = "asis", echo=FALSE---------------------------
# Two parallel columns: entry i of `Role` describes entry i of `Function`.
export_reference <- data.frame(
  Function = c(
    "query_search",
    "perform_search",
    "DefaultOperator",
    "ExactMatchOperator",
    "InOperator",
    "ContainsWordsOperator",
    "ContainsPhraseOperator",
    "ComparisonOperator",
    "RangeOperator",
    "ExistsOperator",
    "SequenceOperator",
    "autoresolve_sequence_type",
    "SeqMotifOperator",
    "StructureOperator",
    "ChemicalOperator",
    "QueryNode",
    "QueryGroup",
    "RequestOptions",
    "ScoredResult",
    "infer_search_service",
    "infer_id_type",
    "parse_rcsb_id",
    "build_entry_id",
    "build_assembly_id",
    "build_entity_id",
    "build_instance_id",
    "add_property",
    "list_rcsb_fields",
    "search_rcsb_fields",
    "validate_properties",
    "generate_json_query",
    "search_graphql",
    "fetch_data",
    "return_data_as_dataframe",
    "data_fetcher",
    "data_fetcher_batch",
    "cache_info",
    "clear_rpdbapi_cache",
    "get_info",
    "find_results",
    "find_papers",
    "describe_chemical",
    "get_fasta_from_rcsb_entry",
    "get_pdb_file",
    "get_pdb_api_url",
    "send_api_request",
    "handle_api_errors",
    "parse_response",
    "as_rpdb_entry",
    "as_rpdb_assembly",
    "as_rpdb_polymer_entity",
    "as_rpdb_chemical_component",
    "as_rpdb_structure",
    "summarize_entries",
    "summarize_assemblies",
    "extract_taxonomy_table",
    "extract_ligand_table",
    "extract_calpha_coordinates",
    "join_structure_sequence"
  ),
  Role = c(
    "High-level convenience search helper",
    "Operator-based search engine",
    "Full-text search operator",
    "Exact attribute match operator",
    "Set-membership operator",
    "Word containment operator",
    "Phrase containment operator",
    "Numeric/date comparison operator",
    "Range filter operator",
    "Attribute existence operator",
    "Sequence similarity search operator",
    "Automatic DNA/RNA/protein detection",
    "Sequence motif search operator",
    "Structure similarity search operator",
    "Chemical descriptor search operator",
    "Wrap one operator as a query node",
    "Combine nodes with AND/OR logic",
    "Pagination and sorting controls",
    "Represent a scored hit",
    "Infer backend service from operator",
    "Infer identifier level from an ID string",
    "Parse an identifier into structured components",
    "Normalize or build entry identifiers",
    "Build assembly identifiers",
    "Build entity identifiers",
    "Build instance or chain identifiers",
    "Merge/extend GraphQL property lists",
    "List known retrievable fields by data type",
    "Search the built-in field registry",
    "Validate a property list against the field registry",
    "Build a GraphQL query string",
    "Low-level GraphQL request helper",
    "Normalize validated GraphQL payloads",
    "Flatten nested payloads into data frames",
    "High-level metadata fetcher",
    "Batch metadata fetcher with retry and provenance",
    "Inspect batch-cache contents",
    "Clear on-disk cache entries",
    "Retrieve full entry metadata",
    "Extract one field across search hits",
    "Extract primary citation titles",
    "Retrieve ligand/chemical-component details",
    "Retrieve FASTA sequences",
    "Download and parse structure files",
    "Build REST endpoint URLs",
    "Send low-level GET/POST requests",
    "Check HTTP status and stop on error",
    "Parse JSON or text responses",
    "Wrap entry data in a typed object",
    "Wrap assembly data in a typed object",
    "Wrap polymer-entity data in a typed object",
    "Wrap chemical-component data in a typed object",
    "Wrap structure data in a typed object",
    "Summarize entry-level metadata",
    "Summarize assembly-level metadata",
    "Extract taxonomy-focused columns",
    "Extract ligand-focused columns",
    "Extract C-alpha coordinates",
    "Join sequence summaries to chain coordinates"
  ),
  stringsAsFactors = FALSE
)
knitr::kable(export_reference, align = c("l", "l"))

## ----every-export-pattern, eval = TRUE, echo=TRUE-----------------------------
# Search helpers
query_search("4HHB")
perform_search(DefaultOperator("4HHB"), verbosity = FALSE)

# Text and attribute operators
DefaultOperator("kinase")
ExactMatchOperator("exptl.method", "X-RAY DIFFRACTION")
InOperator(
  "rcsb_entity_source_organism.taxonomy_lineage.name",
  c("Homo sapiens", "Mus musculus")
)
ContainsWordsOperator("struct.title", "protein kinase")
ContainsPhraseOperator("struct.title", "protein kinase")
ComparisonOperator("rcsb_entry_info.resolution_combined", 2.0, "LESS")
RangeOperator("rcsb_entry_info.resolution_combined", 1.0, 2.5)
ExistsOperator("rcsb_primary_citation.pdbx_database_id_doi")

# Specialized operators
SequenceOperator("MVLSPADKTNVKAAW", sequence_type = "PROTEIN")
autoresolve_sequence_type("ATGCGTACGTAGC")
SeqMotifOperator("[LIV][ACDEFGHIKLMNPQRSTVWY]K[GST]", "PROTEIN", "REGEX")
StructureOperator("4HHB", assembly_id = 1, search_mode = "RELAXED_SHAPE_MATCH")
ChemicalOperator("C1=CC=CC=C1", matching_criterion = "graph-strict")

# Query composition
QueryNode(DefaultOperator("kinase"))
QueryGroup(
  list(DefaultOperator("kinase"), ExistsOperator("rcsb_primary_citation.title")),
  "AND"
)
RequestOptions(result_start_index = 0, num_results = 10)
ScoredResult("4HHB", 0.98)
infer_search_service(StructureOperator("4HHB"))
infer_id_type(c("4HHB", "4HHB-1", "4HHB_1", "4HHB.A", "ATP"))
parse_rcsb_id("4HHB-1")
build_entry_id("4HHB")
build_assembly_id("4HHB", 1)
build_entity_id("4HHB", 1)
build_instance_id("4HHB", "A")

# Metadata helpers
add_property(list(rcsb_entry_info = c("resolution_combined")))
list_rcsb_fields("ENTRY")
search_rcsb_fields("resolution", data_type = "ENTRY")
validate_properties(
  list(rcsb_id = list(), rcsb_entry_info = c("resolution_combined")),
  data_type = "ENTRY",
  strict = TRUE
)
generate_json_query(
  c("4HHB"),
  "ENTRY",
  list(rcsb_id = list(), struct = c("title"))
)
search_graphql(
  list(
    query = generate_json_query(c("4HHB"), "ENTRY", list(rcsb_id = list()))
  )
)
fetch_data(
  generate_json_query(c("4HHB"), "ENTRY", list(rcsb_id = list())),
  "ENTRY",
  "4HHB"
)
return_data_as_dataframe(
  fetch_data(
    generate_json_query(c("4HHB"), "ENTRY", list(rcsb_id = list())),
    "ENTRY",
    "4HHB"
  ),
  "ENTRY",
  "4HHB"
)
data_fetcher(
  "4HHB",
  "ENTRY",
  list(rcsb_id = list(), struct = c("title"))
)
data_fetcher_batch(
  c("4HHB", "1CRN"),
  "ENTRY",
  list(rcsb_id = list(), struct = c("title")),
  batch_size = 1,
  cache = FALSE
)
# Both cache helpers are called without arguments here.
cache_info()
clear_rpdbapi_cache()
quietly(get_info("4HHB"))
quietly(find_results("4HHB", field = "struct_keywords"))
quietly(find_papers("4HHB", max_results = 3))
describe_chemical("ATP")
get_fasta_from_rcsb_entry("4HHB")

# Files and low-level HTTP
get_pdb_file("4HHB", filetype = "cif", verbosity = FALSE)
get_pdb_api_url("core/entry/", "4HHB")
resp <- send_api_request(
  get_pdb_api_url("core/entry/", "4HHB"),
  verbosity = FALSE
)
handle_api_errors(resp, get_pdb_api_url("core/entry/", "4HHB"))
parse_response(resp, format = "json")

# Object wrappers and analysis helpers
as_rpdb_entry(data.frame(rcsb_id = "4HHB"))
as_rpdb_assembly(data.frame(rcsb_id = "4HHB-1"))
as_rpdb_polymer_entity(data.frame(rcsb_id = "4HHB_1"))
as_rpdb_chemical_component(data.frame(rcsb_id = "ATP"))
as_rpdb_structure(
  get_pdb_file("4HHB", filetype = "cif", verbosity = FALSE)
)
summarize_entries(
  data.frame(method = "X-RAY DIFFRACTION", resolution_combined = "1.8")
)
summarize_assemblies(
  data.frame(oligomeric_count = "2", symbol = "C2")
)
extract_taxonomy_table(
  data.frame(rcsb_id = "4HHB_1", ncbi_taxonomy_id = "9606")
)
extract_ligand_table(
  data.frame(rcsb_id = "ATP", formula_weight = "507.18")
)
extract_calpha_coordinates(
  get_pdb_file("4HHB", filetype = "cif", verbosity = FALSE)
)
join_structure_sequence(
  get_pdb_file("4HHB", filetype = "cif", verbosity = FALSE),
  get_fasta_from_rcsb_entry("4HHB")
)

## ----id-format-table, results = "asis", echo=FALSE----------------------------
# Reference table: one row per data/return type, with its usual identifier
# shape and what it is typically used for. Columns are assembled inside
# local() so no helper variables are left behind.
id_reference <- local({
  type_names <- c(
    "ENTRY",
    "ASSEMBLY",
    "POLYMER_ENTITY",
    "BRANCHED_ENTITY",
    "NONPOLYMER_ENTITY",
    "POLYMER_ENTITY_INSTANCE",
    "BRANCHED_ENTITY_INSTANCE",
    "NONPOLYMER_ENTITY_INSTANCE",
    "CHEMICAL_COMPONENT"
  )
  id_formats <- c(
    "4-character PDB ID, e.g. 4HHB",
    "Entry plus assembly ID, e.g. 4HHB-1",
    "Entry plus entity ID, e.g. 4HHB_1",
    "Entry plus branched entity ID",
    "Entry plus nonpolymer entity ID, e.g. 3PQR_5",
    "Instance or chain-level identifier, endpoint-specific",
    "Instance-level identifier, endpoint-specific",
    "Instance-level identifier, endpoint-specific",
    "Chemical component ID, e.g. ATP"
  )
  typical_uses <- c(
    "Whole-structure metadata",
    "Biological assembly and symmetry",
    "Entity-level taxonomy or sequence annotations",
    "Glycan/branched entity records",
    "Ligand records within structures",
    "Chain-specific annotations",
    "Branched entity instance records",
    "Ligand instance records",
    "Ligand chemistry and descriptors"
  )
  data.frame(
    Data_or_Return_Type = type_names,
    Typical_ID_Format = id_formats,
    Typical_Use = typical_uses,
    stringsAsFactors = FALSE
  )
})
knitr::kable(id_reference, align = c("l", "l", "l"))

## ----return-contract-table, results = "asis", echo=FALSE----------------------
# Reference table: call form, the class listed for its return value, and a
# short description. Built the same way as the table above.
contract_reference <- local({
  call_forms <- c(
    "query_search(return_type = 'entry')",
    "query_search(other return_type)",
    "perform_search()",
    "perform_search(return_with_scores = TRUE)",
    "perform_search(return_raw_json_dict = TRUE)",
    "fetch_data()",
    "data_fetcher_batch(return_as_dataframe = TRUE)",
    "data_fetcher(return_as_dataframe = TRUE)",
    "data_fetcher(return_as_dataframe = FALSE)",
    "as_rpdb_entry()",
    "as_rpdb_assembly()",
    "as_rpdb_polymer_entity()",
    "as_rpdb_chemical_component()",
    "as_rpdb_structure()"
  )
  return_classes <- c(
    "rPDBapi_query_ids",
    "rPDBapi_query_response",
    "rPDBapi_search_ids",
    "rPDBapi_search_scores",
    "rPDBapi_search_raw_response",
    "rPDBapi_fetch_response",
    "rPDBapi_dataframe",
    "rPDBapi_dataframe",
    "rPDBapi_fetch_response",
    "rPDBapi_entry",
    "rPDBapi_assembly",
    "rPDBapi_polymer_entity",
    "rPDBapi_chemical_component",
    "rPDBapi_structure"
  )
  meanings <- c(
    "Identifier vector from query_search()",
    "Parsed query_search payload",
    "Identifier vector from perform_search()",
    "Scored search results",
    "Raw JSON-like search payload",
    "Validated GraphQL fetch payload",
    "Flattened batch result with provenance metadata",
    "Flattened analysis-ready table",
    "Nested validated fetch payload",
    "Typed entry wrapper around retrieved data",
    "Typed assembly wrapper around retrieved data",
    "Typed polymer-entity wrapper around retrieved data",
    "Typed chemical-component wrapper around retrieved data",
    "Typed structure wrapper around retrieved data"
  )
  data.frame(
    Function = call_forms,
    Return_Class = return_classes,
    Meaning = meanings,
    stringsAsFactors = FALSE
  )
})
knitr::kable(contract_reference, align = c("l", "l", "l")) ## ----error-guidance----------------------------------------------------------- error_guidance <- data.frame( Scenario = c( "Malformed search response", "Unsupported return-type mapping", "Invalid input to search/fetch helper", "Unknown property or subproperty in strict mode", "Batch retrieval failure after retries", "HTTP failure", "Response parsing failure" ), Typical_Class_or_Source = c( "rPDBapi_error_malformed_response", "rPDBapi_error_unsupported_mapping", "rPDBapi_error_invalid_input", "validate_properties() / generate_json_query()", "data_fetcher_batch()", "handle_api_errors() / send_api_request()", "parse_response()" ), stringsAsFactors = FALSE ) knitr::kable(error_guidance, align = c("l", "l")) ## ----reproducibility---------------------------------------------------------- analysis_manifest <- list( live_examples = TRUE, package_version = as.character(utils::packageVersion("rPDBapi")), query = kinase_query, requested_entry_fields = entry_properties, strict_property_validation = getOption("rPDBapi.strict_property_validation", FALSE), built_ids = list( entry = build_entry_id("4HHB"), assembly = build_assembly_id("4HHB", 1), entity = build_entity_id("4HHB", 1), instance = build_instance_id("4HHB", "A") ), batch_provenance_example = attr(kinase_batch, "provenance") ) str(analysis_manifest, max.level = 2) ## ----session-info------------------------------------------------------------- sessionInfo()