humanpred · billdenney · May 21, 2026 · May 21, 2026 · May 21, 2026 · May 21, 2026
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -114,6 +114,23 @@ repos:
         pass_filenames: false
         files: '^(_pkgdown\.yml|NAMESPACE|R/.*\.R)$'
 
+      - id: rd-xref-check
+        name: Rd cross-reference resolution
+        description: >
+          Catches \link{} targets in man/*.Rd that don't resolve to
+          either a topic in this package, a documented dependency, or
+          one of the base / recommended packages. This is the same
+          WARNING that `R CMD check --as-cran` emits under "checking
+          Rd cross-references" and that fails every platform of the
+          cross-platform R-CMD-check matrix. Uses the same internal R
+          function (tools:::.check_Rd_xrefs) R CMD check itself uses;
+          operates on the source tree without needing the package
+          installed - fast enough for every commit.
+        entry: Rscript tools/check-rd-xrefs.R
+        language: system
+        pass_filenames: false
+        files: '^(R/.*\.R|man/.*\.Rd|DESCRIPTION)$'
+
   # Conventional Commits message check (server side: defense in depth via CI).
   - repo: https://github.com/compilerla/conventional-pre-commit
     rev: v3.6.0

diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: pdfium
 Title: Idiomatic R Bindings to the PDFium PDF Engine
-Version: 0.1.0
+Version: 0.0.9000
 Authors@R: c(
     person("Bill", "Denney", , "wdenney@humanpredictions.com",
            role = c("aut", "cre"),

diff --git a/NAMESPACE b/NAMESPACE
@@ -42,6 +42,8 @@ S3method(print,pdfium_obj_list)
 S3method(print,pdfium_page)
 S3method(print,pdfium_signature)
 S3method(print,pdfium_signature_list)
+S3method(summary,pdfium_doc)
+S3method(summary,pdfium_page)
 export(as_pdfium_annot_list)
 export(as_pdfium_attachment_list)
 export(as_pdfium_bookmark_list)
@@ -103,6 +105,7 @@ export(pdf_bookmark_title)
 export(pdf_bookmark_uri)
 export(pdf_clip_path_count)
 export(pdf_clip_path_segments)
+export(pdf_dir_summary)
 export(pdf_doc_bookmark_find)
 export(pdf_doc_bookmarks)
 export(pdf_doc_close)
@@ -117,10 +120,12 @@ export(pdf_doc_named_dest_by_name)
 export(pdf_doc_named_dests)
 export(pdf_doc_new)
 export(pdf_doc_open)
+export(pdf_doc_open_url)
 export(pdf_doc_page_mode)
 export(pdf_doc_permissions)
 export(pdf_doc_security)
 export(pdf_doc_set_language)
+export(pdf_doc_summary)
 export(pdf_doc_text)
 export(pdf_doc_trailer_ends)
 export(pdf_doc_user_permissions)
@@ -196,6 +201,7 @@ export(pdf_page_set_rotation)
 export(pdf_page_size)
 export(pdf_page_thumbnail)
 export(pdf_pages_reorder)
+export(pdf_pages_summary)
 export(pdf_parse_date)
 export(pdf_path_append)
 export(pdf_path_bezier_to)

diff --git a/NEWS.md b/NEWS.md
@@ -11,7 +11,10 @@ PDFs created with `pdf_doc_new()` are also writable).
 * `pdf_doc_open()` / `pdf_doc_close()`, `pdf_doc_new()`,
   `pdf_save()` / `pdf_save_to_raw()` — open existing PDFs (optionally
   with `readwrite = TRUE`), build new ones in memory, and persist
-  the result.
+  the result. `pdf_doc_open_url(url)` is a convenience wrapper that
+  fetches a `http://` / `https://` / `ftp://` / `file://` URL via
+  `url()` + `readBin()` and loads the bytes through PDFium's
+  in-memory path — no temporary file on disk.
 * `pdf_doc_info()`, `pdf_doc_meta()`, `pdf_doc_text()`,
   `pdf_doc_fonts()`, `pdf_doc_file_id()`, `pdf_doc_page_mode()`,
   `pdf_doc_viewer_preferences()`, `pdf_doc_viewer_preference_by_name()`,
@@ -27,6 +30,23 @@ PDFs created with `pdf_doc_new()` are also writable).
 * `pdf_page_load()` / `pdf_page_close()`, `pdf_page_size()`,
   `pdf_page_rotation()`, `pdf_page_box()`, `pdf_page_thumbnail()` —
   per-page handles and metadata.
+* `pdf_doc_summary()` and `pdf_pages_summary()` — one-call triage
+  helpers. `pdf_doc_summary()` returns a single-row tibble
+  aggregating the most-asked-for facts about a PDF (path, page
+  count, Info-dictionary metadata, feature flags, per-feature
+  counts, file-ID tuple); `pdf_pages_summary()` is the per-page
+  sibling (width / height / rotation / label, all via the fast
+  by-index PDFium readers). `summary(doc)` and `summary(page)`
+  dispatch to the matching tibble — `summary(page)` adds the
+  page-loaded counts (annotation count, page-object count,
+  text-run count, link count) since the page is already loaded.
+* `pdf_dir_summary(dir)` — scans a directory for PDF files and
+  returns one row per file in the `pdf_doc_summary()` shape.
+  Recursive scan via `recursive = TRUE`; pattern-matches `.pdf`
+  case-insensitively by default. The `errors` argument selects
+  one of `"warn"` (default — surface broken files but don't
+  abort), `"skip"` (silently drop), or `"stop"` (abort on the
+  first failure).
 
 ## Page objects, paths, and text
 

diff --git a/R/classes.R b/R/classes.R
@@ -72,6 +72,7 @@ print.pdfium_doc <- function(x, ...) {
   invisible(x)
 }
 
+
 #' Construct a `pdfium_page` from an external pointer
 #'
 #' Internal helper. The page's externalptr carries its parent document's

diff --git a/R/doc.R b/R/doc.R
@@ -506,3 +506,259 @@ pdf_doc_trailer_ends <- function(doc) {
   doc <- as_open_doc(doc)
   cpp_doc_trailer_ends(doc$ptr)
 }
+
+#' One-call summary of a PDF document
+#'
+#' Returns a single-row tibble that aggregates the most-asked-for
+#' facts about a PDF document: file path, page count, Info-dictionary
+#' metadata, structural feature flags (forms, attachments, bookmarks,
+#' signatures, JavaScript, tagged-PDF), counts for each of those
+#' feature groups, encryption state, and the file-ID tuple. Designed
+#' to replace the eight-or-so individual calls users typically chain
+#' together when triaging a PDF.
+#'
+#' Each column either exposes an existing reader or is a `length()`
+#' over the matching `pdfium_*_list`. No new C-side work — purely an
+#' R-side aggregation. See **Columns** below for the source reader
+#' for each entry.
+#'
+#' @section Columns:
+#' * `path` — character; canonical path the doc was opened from, or
+#'   `"<raw bytes>"` for in-memory loads.
+#' * `page_count`, `file_version` — from [pdf_doc_info()].
+#' * `title`, `author`, `subject`, `keywords`, `creator`, `producer`,
+#'   `creation_date`, `mod_date`, `trapped` — from [pdf_doc_info()];
+#'   missing entries appear as `""`.
+#' * `creation_date_parsed`, `mod_date_parsed` — POSIXct (UTC), `NA`
+#'   when the source date is empty or unparseable. From
+#'   [pdf_parse_date()].
+#' * `is_tagged` — from [pdf_doc_is_tagged()].
+#' * `is_encrypted` — `TRUE` when [pdf_doc_security()] returns a
+#'   non-NA revision; `FALSE` otherwise.
+#' * `security_revision` — from [pdf_doc_security()]; `NA` for
+#'   unencrypted PDFs.
+#' * `xref_valid` — from [pdf_doc_xref_valid()].
+#' * `bookmark_count`, `attachment_count`, `signature_count`,
+#'   `form_field_count`, `javascript_count`, `named_dest_count` —
+#'   `length()` of [pdf_doc_bookmarks()], [pdf_attachments()],
+#'   [pdf_signatures()], [pdf_form_fields()], [pdf_doc_javascript()],
+#'   and [pdf_doc_named_dests()] respectively. Zero when the
+#'   document has none of the corresponding entries.
+#' * `has_page_labels` — `TRUE` when [pdf_page_labels()] returns
+#'   non-NA strings.
+#' * `file_id_permanent`, `file_id_changing` — from
+#'   [pdf_doc_file_id()]; UTF-8 hex strings or `NA`.
+#'
+#' @param doc A `pdfium_doc` from [pdf_doc_open()], or a character
+#'   path.
+#' @param password Optional password for encrypted PDFs when `doc`
+#'   is a path. Ignored when `doc` is an open `pdfium_doc`.
+#' @return A one-row tibble.
+#' @seealso [pdf_doc_info()] for the Info-dictionary subset alone,
+#'   the per-feature readers listed under **Columns** for richer
+#'   per-row data.
+#' @examples
+#' fixture <- system.file("extdata", "fixtures", "annotated.pdf",
+#'   package = "pdfium"
+#' )
+#' if (nzchar(fixture)) pdf_doc_summary(fixture)
+#' @export
+pdf_doc_summary <- function(doc, password = NULL) {
+  if (is.character(doc)) {
+    handle <- pdf_doc_open(doc, password = password)
+    on.exit(pdf_doc_close(handle), add = TRUE)
+    return(pdf_doc_summary(handle))
+  }
+  checkmate::assert_class(doc, "pdfium_doc")
+  if (!is_open(doc)) stop("Document has been closed.", call. = FALSE)
+
+  info <- pdf_doc_info(doc)
+  rev <- pdf_doc_security(doc)
+  page_labels <- tryCatch(pdf_page_labels(doc),
+                          error = function(e) NULL)
+  file_id <- list(
+    permanent = file_id_hex_or_na(tryCatch(
+      pdf_doc_file_id(doc, "permanent"),
+      error = function(e) raw(0)
+    )),
+    changing = file_id_hex_or_na(tryCatch(
+      pdf_doc_file_id(doc, "changing"),
+      error = function(e) raw(0)
+    ))
+  )
+
+  tibble::tibble(
+    path                 = doc$path,
+    page_count           = info$page_count,
+    file_version         = info$file_version,
+    title                = info$title %||% "",
+    author               = info$author %||% "",
+    subject              = info$subject %||% "",
+    keywords             = info$keywords %||% "",
+    creator              = info$creator %||% "",
+    producer             = info$producer %||% "",
+    creation_date        = info$creation_date %||% "",
+    mod_date             = info$mod_date %||% "",
+    trapped              = info$trapped %||% "",
+    creation_date_parsed = info$creation_date_parsed,
+    mod_date_parsed      = info$mod_date_parsed,
+    is_tagged            = pdf_doc_is_tagged(doc),
+    is_encrypted         = !is.na(rev),
+    security_revision    = rev,
+    xref_valid           = pdf_doc_xref_valid(doc),
+    bookmark_count       = length(pdf_doc_bookmarks(doc)),
+    attachment_count     = length(pdf_attachments(doc)),
+    signature_count      = length(pdf_signatures(doc)),
+    form_field_count     = length(pdf_form_fields(doc)),
+    javascript_count     = length(pdf_doc_javascript(doc)),
+    named_dest_count     = length(pdf_doc_named_dests(doc)),
+    has_page_labels      = !is.null(page_labels) &&
+                             any(!is.na(page_labels) & nzchar(page_labels)),
+    file_id_permanent    = file_id$permanent,
+    file_id_changing     = file_id$changing
+  )
+}
+
+# Internal: tiny version of rlang's %||% so we don't pull rlang in
+# just for the summary path. Returns `b` when `a` is NULL or NA.
+`%||%` <- function(a, b) {
+  if (is.null(a) || (length(a) == 1L && is.na(a))) b else a
+}
+
+#' Document-level summary
+#'
+#' `summary()` method for `pdfium_doc`. Defers to
+#' [pdf_doc_summary()] so users can call `summary(doc)` for the
+#' single-row tibble of every key fact about the PDF — page count,
+#' Info-dictionary metadata, structural feature flags, per-feature
+#' counts, the file-ID tuple — in one call.
+#'
+#' @param object A `pdfium_doc` from [pdf_doc_open()].
+#' @param ... Unused (S3 generic compatibility).
+#' @return The tibble returned by [pdf_doc_summary()].
+#' @seealso [pdf_doc_summary()].
+#' @export
+summary.pdfium_doc <- function(object, ...) {
+  pdf_doc_summary(object)
+}
+
+#' Summarise every PDF in a directory in one call
+#'
+#' Scans a directory for PDF files and returns a tibble whose rows
+#' are the [pdf_doc_summary()] output for each file. The natural
+#' replacement for the standard "loop over a folder of PDFs and
+#' triage" workflow — encrypted-which / has-forms-which /
+#' has-attachments-which.
+#'
+#' Files that fail to open (corrupt, wrong format, password
+#' protected) are handled per the `errors` argument:
+#'
+#' * `"warn"` (default) — a `warning()` per failed file; the file
+#'   is dropped from the result tibble.
+#' * `"skip"` — silently dropped.
+#' * `"stop"` — the first failed file raises an error and the
+#'   function aborts.
+#'
+#' @param dir Character scalar. Path to the directory to scan.
+#' @param pattern Regular expression filtering filenames. Defaults
+#'   to `"\\.pdf$"` (case-insensitive).
+#' @param recursive Logical. When `TRUE`, descend into
+#'   subdirectories. Defaults `FALSE`.
+#' @param password Optional password applied to every file. `NULL`
+#'   (default) tries each file without a password. Useful when all
+#'   files share the same password.
+#' @param errors One of `"warn"`, `"skip"`, `"stop"` — see Details.
+#' @return A tibble with the same columns as [pdf_doc_summary()].
+#'   Zero rows when the directory has no PDFs (or every PDF failed
+#'   to open under `errors = "skip"` / `"warn"`).
+#' @seealso [pdf_doc_summary()] for the single-file companion.
+#' @examples
+#' fixture_dir <- system.file("extdata", "fixtures",
+#'                            package = "pdfium")
+#' if (nzchar(fixture_dir)) {
+#'   pdf_dir_summary(fixture_dir)
+#' }
+#' @export
+pdf_dir_summary <- function(dir = ".", pattern = "\\.pdf$",
+                             recursive = FALSE, password = NULL,
+                             errors = c("warn", "skip", "stop")) {
+  checkmate::assert_directory_exists(dir)
+  checkmate::assert_string(pattern)
+  checkmate::assert_flag(recursive)
+  errors <- match.arg(errors)
+
+  files <- list.files(dir, pattern = pattern, recursive = recursive,
+                       full.names = TRUE, ignore.case = TRUE)
+  if (length(files) == 0L) {
+    return(pdf_doc_summary_empty())
+  }
+
+  rows <- lapply(files, function(f) {
+    tryCatch(
+      pdf_doc_summary(f, password = password),
+      error = function(e) {
+        if (errors == "stop") {
+          stop(sprintf("pdf_dir_summary: failed to read '%s': %s",
+                       f, conditionMessage(e)), call. = FALSE)
+        }
+        if (errors == "warn") {
+          warning(sprintf("pdf_dir_summary: failed to read '%s': %s",
+                          f, conditionMessage(e)), call. = FALSE)
+        }
+        NULL
+      }
+    )
+  })
+  ok <- !vapply(rows, is.null, logical(1L))
+  if (!any(ok)) {
+    return(pdf_doc_summary_empty())
+  }
+  out <- do.call(rbind, rows[ok])
+  tibble::as_tibble(out)
+}
+
+# Internal: zero-row tibble matching pdf_doc_summary's column shape.
+# Used by pdf_dir_summary() when the directory is empty (or every
+# file failed under `errors = "skip"` / `"warn"`).
+pdf_doc_summary_empty <- function() {
+  tibble::tibble(
+    path                 = character(),
+    page_count           = integer(),
+    file_version         = integer(),
+    title                = character(),
+    author               = character(),
+    subject              = character(),
+    keywords             = character(),
+    creator              = character(),
+    producer             = character(),
+    creation_date        = character(),
+    mod_date             = character(),
+    trapped              = character(),
+    creation_date_parsed = as.POSIXct(character(), tz = "UTC"),
+    mod_date_parsed      = as.POSIXct(character(), tz = "UTC"),
+    is_tagged            = logical(),
+    is_encrypted         = logical(),
+    security_revision    = integer(),
+    xref_valid           = logical(),
+    bookmark_count       = integer(),
+    attachment_count     = integer(),
+    signature_count      = integer(),
+    form_field_count     = integer(),
+    javascript_count     = integer(),
+    named_dest_count     = integer(),
+    has_page_labels      = logical(),
+    file_id_permanent    = character(),
+    file_id_changing     = character()
+  )
+}
+
+# Internal: convert pdf_doc_file_id()'s raw return to a hex string,
+# or NA_character_ when empty. Hoisted from pdf_doc_summary so its
+# two branches can be unit-tested without a fixture that carries an
+# `/ID` array (no shipped fixture does).
+file_id_hex_or_na <- function(r) {
+  if (length(r) == 0L) {
+    return(NA_character_)
+  }
+  paste(format(r), collapse = "")
+}
-Original file line number
+Diff line change
@@ Expand Up / @@ -72,6 +72,7 @@ print.pdfium_doc <- function(x, ...) { @@
       invisible(x)
     }
     #' Construct a `pdfium_page` from an external pointer
     #'
     #' Internal helper. The page's externalptr carries its parent document's
@@ Expand Down @@