From 05d990bb9e7418be64a3b5c322c8926eee70a530 Mon Sep 17 00:00:00 2001
From: Bill Denney <wdenney@humanpredictions.com>
Date: Thu, 21 May 2026 14:20:38 +0000
Subject: [PATCH 01/10] =?UTF-8?q?feat(doc):=20add=20pdf=5Fdoc=5Fsummary()?=
 =?UTF-8?q?=20=E2=80=94=20one-call=20PDF=20triage=20helper?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Returns a single-row tibble that aggregates the most-asked-for
facts about a PDF document: file path, page count, Info-dictionary
metadata, structural feature flags (forms, attachments, bookmarks,
signatures, JavaScript, tagged-PDF), counts for each feature group,
encryption state, xref validity, and the file-ID tuple. Designed
to replace the eight-or-so individual reader calls users typically
chain together when triaging an unfamiliar PDF.

27 columns aggregated from existing readers:
* `pdf_doc_info()` — page count, file version, Info-dict text +
  dates (both raw PDF strings and POSIXct parses)
* `pdf_doc_is_tagged()`, `pdf_doc_security()`,
  `pdf_doc_xref_valid()` — structural / encryption flags
* `pdf_doc_bookmarks()`, `pdf_attachments()`, `pdf_signatures()`,
  `pdf_form_fields()`, `pdf_doc_javascript()`,
  `pdf_doc_named_dests()` — `length()` over each list
* `pdf_page_labels()` — boolean "has labelled pages?"
* `pdf_doc_file_id()` — hex-encoded as character (NA when absent)

Accepts both a `pdfium_doc` and a character path, mirroring the
two-input-form convention `pdf_doc_info()` already uses. The path
form opens + closes internally.

The `file_id` columns required a small helper (`file_id_hex_or_na`,
internal) because `pdf_doc_file_id()` returns a `raw(0)` for the
common case of PDFs without an `/ID` trailer entry — letting that
go into a tibble column recycles the whole tibble to zero rows.
The helper is hoisted to module scope so both branches can be unit-
tested without a fixture that has `/ID` set (none of the shipped
fixtures do).

14 new tests in `test-doc-summary.R` cover column shape, types,
counts, path / raw-bytes / doc input forms, error cases, and both
branches of the file-ID helper. Full suite 2081/2081 pass; R
coverage 100% (2782/2782 lines); 0 lints.

DESCRIPTION version bumped to 0.1.0.9000 to mark the start of the
post-CRAN-submission development cycle.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 DESCRIPTION                       |   2 +-
 NAMESPACE                         |   1 +
 NEWS.md                           |  12 +++
 R/doc.R                           | 129 +++++++++++++++++++++++++++++
 _pkgdown.yml                      |   1 +
 man/pdf_doc_summary.Rd            |  75 +++++++++++++++++
 tests/testthat/test-doc-summary.R | 131 ++++++++++++++++++++++++++++++
 7 files changed, 350 insertions(+), 1 deletion(-)
 create mode 100644 man/pdf_doc_summary.Rd
 create mode 100644 tests/testthat/test-doc-summary.R

diff --git a/DESCRIPTION b/DESCRIPTION
index 367f07a..24eef99 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: pdfium
 Title: Idiomatic R Bindings to the PDFium PDF Engine
-Version: 0.1.0
+Version: 0.1.0.9000
 Authors@R: c(
     person("Bill", "Denney", , "wdenney@humanpredictions.com",
            role = c("aut", "cre"),
diff --git a/NAMESPACE b/NAMESPACE
index 89c67fc..3f15c60 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -121,6 +121,7 @@ export(pdf_doc_page_mode)
 export(pdf_doc_permissions)
 export(pdf_doc_security)
 export(pdf_doc_set_language)
+export(pdf_doc_summary)
 export(pdf_doc_text)
 export(pdf_doc_trailer_ends)
 export(pdf_doc_user_permissions)
diff --git a/NEWS.md b/NEWS.md
index 5ab8f0a..580f04e 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,3 +1,15 @@
+# pdfium (development version)
+
+## New features
+
+* `pdf_doc_summary()` — one-call helper that returns a single-row
+  tibble aggregating the most-asked-for facts about a PDF: path,
+  page count, Info-dictionary metadata, structural feature flags
+  (tagged, encrypted, xref-valid, has-page-labels), counts for
+  each feature group, and the file-ID tuple. Replaces the
+  eight-or-so individual calls users typically chain together when
+  triaging a PDF.
+
 # pdfium 0.1.0
 
 Initial CRAN release. This is the first public version of `pdfium`,
diff --git a/R/doc.R b/R/doc.R
index bed9d01..c31f939 100644
--- a/R/doc.R
+++ b/R/doc.R
@@ -506,3 +506,132 @@ pdf_doc_trailer_ends <- function(doc) {
   doc <- as_open_doc(doc)
   cpp_doc_trailer_ends(doc$ptr)
 }
+
+#' One-call summary of a PDF document
+#'
+#' Returns a single-row tibble that aggregates the most-asked-for
+#' facts about a PDF document: file path, page count, Info-dictionary
+#' metadata, structural feature flags (forms, attachments, bookmarks,
+#' signatures, JavaScript, tagged-PDF), counts for each of those
+#' feature groups, encryption state, and the file-ID tuple. Designed
+#' to replace the eight-or-so individual calls users typically chain
+#' together when triaging a PDF.
+#'
+#' Each column either exposes an existing reader or is a `length()`
+#' over the matching `pdfium_*_list`. No new C-side work — purely an
+#' R-side aggregation. See **Columns** below for the source reader
+#' for each entry.
+#'
+#' @section Columns:
+#' * `path` — character; canonical path the doc was opened from, or
+#'   `"<raw bytes>"` for in-memory loads.
+#' * `page_count`, `file_version` — from [pdf_doc_info()].
+#' * `title`, `author`, `subject`, `keywords`, `creator`, `producer`,
+#'   `creation_date`, `mod_date`, `trapped` — from [pdf_doc_info()];
+#'   missing entries appear as `""`.
+#' * `creation_date_parsed`, `mod_date_parsed` — POSIXct (UTC), `NA`
+#'   when the source date is empty or unparseable. From
+#'   [pdf_parse_date()].
+#' * `is_tagged` — from [pdf_doc_is_tagged()].
+#' * `is_encrypted` — `TRUE` when [pdf_doc_security()] returns a
+#'   non-NA revision; `FALSE` otherwise.
+#' * `security_revision` — from [pdf_doc_security()]; `NA` for
+#'   unencrypted PDFs.
+#' * `xref_valid` — from [pdf_doc_xref_valid()].
+#' * `bookmark_count`, `attachment_count`, `signature_count`,
+#'   `form_field_count`, `javascript_count`, `named_dest_count` —
+#'   `length()` of [pdf_doc_bookmarks()], [pdf_attachments()],
+#'   [pdf_signatures()], [pdf_form_fields()], [pdf_doc_javascript()],
+#'   and [pdf_doc_named_dests()] respectively. Zero when the
+#'   document has none of the corresponding entries.
+#' * `has_page_labels` — `TRUE` when [pdf_page_labels()] returns at
+#'   least one non-NA, non-empty label.
+#' * `file_id_permanent`, `file_id_changing` — from
+#'   [pdf_doc_file_id()]; UTF-8 hex strings or `NA`.
+#'
+#' @param doc A `pdfium_doc` from [pdf_doc_open()], or a character
+#'   path.
+#' @param password Optional password for encrypted PDFs when `doc`
+#'   is a path. Ignored when `doc` is an open `pdfium_doc`.
+#' @return A one-row tibble.
+#' @seealso [pdf_doc_info()] for the Info-dictionary subset alone,
+#'   the per-feature readers listed under **Columns** for richer
+#'   per-row data.
+#' @examples
+#' fixture <- system.file("extdata", "fixtures", "annotated.pdf",
+#'   package = "pdfium"
+#' )
+#' if (nzchar(fixture)) pdf_doc_summary(fixture)
+#' @export
+pdf_doc_summary <- function(doc, password = NULL) {
+  if (is.character(doc)) {  # path input: open, summarise, close on exit
+    handle <- pdf_doc_open(doc, password = password)
+    on.exit(pdf_doc_close(handle), add = TRUE)
+    return(pdf_doc_summary(handle))
+  }
+  checkmate::assert_class(doc, "pdfium_doc")
+  if (!is_open(doc)) stop("Document has been closed.", call. = FALSE)
+
+  info <- pdf_doc_info(doc)
+  rev <- pdf_doc_security(doc)  # NA is reported below as "not encrypted"
+  page_labels <- tryCatch(pdf_page_labels(doc),  # reader error -> NULL
+                          error = function(e) NULL)
+  file_id <- list(  # a reader error is treated like a missing ID (NA)
+    permanent = file_id_hex_or_na(tryCatch(
+      pdf_doc_file_id(doc, "permanent"),
+      error = function(e) raw(0)
+    )),
+    changing = file_id_hex_or_na(tryCatch(
+      pdf_doc_file_id(doc, "changing"),
+      error = function(e) raw(0)
+    ))
+  )
+
+  tibble::tibble(
+    path                 = doc$path,
+    page_count           = info$page_count,
+    file_version         = info$file_version,
+    title                = info$title %||% "",  # NULL or lone NA -> ""
+    author               = info$author %||% "",
+    subject              = info$subject %||% "",
+    keywords             = info$keywords %||% "",
+    creator              = info$creator %||% "",
+    producer             = info$producer %||% "",
+    creation_date        = info$creation_date %||% "",
+    mod_date             = info$mod_date %||% "",
+    trapped              = info$trapped %||% "",
+    creation_date_parsed = info$creation_date_parsed,
+    mod_date_parsed      = info$mod_date_parsed,
+    is_tagged            = pdf_doc_is_tagged(doc),
+    is_encrypted         = !is.na(rev),
+    security_revision    = rev,
+    xref_valid           = pdf_doc_xref_valid(doc),
+    bookmark_count       = length(pdf_doc_bookmarks(doc)),
+    attachment_count     = length(pdf_attachments(doc)),
+    signature_count      = length(pdf_signatures(doc)),
+    form_field_count     = length(pdf_form_fields(doc)),
+    javascript_count     = length(pdf_doc_javascript(doc)),
+    named_dest_count     = length(pdf_doc_named_dests(doc)),
+    has_page_labels      = !is.null(page_labels) &&  # any non-NA, non-"" label
+                             any(!is.na(page_labels) & nzchar(page_labels)),
+    file_id_permanent    = file_id$permanent,
+    file_id_changing     = file_id$changing
+  )
+}
+
+# Internal: local NULL/NA-default operator, defined here so the summary
+# path needs no rlang import. Yields `b` when `a` is NULL or a lone NA.
+`%||%` <- function(a, b) {
+  if (!is.null(a) && !(length(a) == 1L && is.na(a))) a else b
+}
+
+# Internal: hex-encode the raw vector that pdf_doc_file_id() returns;
+# a zero-length input becomes NA_character_ so it still fills one
+# tibble cell. Kept at top level so both outcomes can be tested
+# directly (no shipped fixture carries an `/ID` array).
+file_id_hex_or_na <- function(r) {
+  if (length(r) > 0L) {
+    return(paste(format(r), collapse = ""))
+  }
+  NA_character_
+}
diff --git a/_pkgdown.yml b/_pkgdown.yml
index 29a3dd1..80e8ed6 100644
--- a/_pkgdown.yml
+++ b/_pkgdown.yml
@@ -30,6 +30,7 @@ reference:
       - pdf_page_count
       - pdf_doc_info
       - pdf_doc_meta
+      - pdf_doc_summary
       - pdf_parse_date
       - pdf_doc_text
       - pdf_doc_fonts
diff --git a/man/pdf_doc_summary.Rd b/man/pdf_doc_summary.Rd
new file mode 100644
index 0000000..706fa52
--- /dev/null
+++ b/man/pdf_doc_summary.Rd
@@ -0,0 +1,75 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/doc.R
+\name{pdf_doc_summary}
+\alias{pdf_doc_summary}
+\title{One-call summary of a PDF document}
+\usage{
+pdf_doc_summary(doc, password = NULL)
+}
+\arguments{
+\item{doc}{A \code{pdfium_doc} from \code{\link[=pdf_doc_open]{pdf_doc_open()}}, or a character
+path.}
+
+\item{password}{Optional password for encrypted PDFs when \code{doc}
+is a path. Ignored when \code{doc} is an open \code{pdfium_doc}.}
+}
+\value{
+A one-row tibble.
+}
+\description{
+Returns a single-row tibble that aggregates the most-asked-for
+facts about a PDF document: file path, page count, Info-dictionary
+metadata, structural feature flags (forms, attachments, bookmarks,
+signatures, JavaScript, tagged-PDF), counts for each of those
+feature groups, encryption state, and the file-ID tuple. Designed
+to replace the eight-or-so individual calls users typically chain
+together when triaging a PDF.
+}
+\details{
+Each column either exposes an existing reader or is a \code{length()}
+over the matching \verb{pdfium_*_list}. No new C-side work — purely an
+R-side aggregation. See \strong{Columns} below for the source reader
+for each entry.
+}
+\section{Columns}{
+
+\itemize{
+\item \code{path} — character; canonical path the doc was opened from, or
+\code{"<raw bytes>"} for in-memory loads.
+\item \code{page_count}, \code{file_version} — from \code{\link[=pdf_doc_info]{pdf_doc_info()}}.
+\item \code{title}, \code{author}, \code{subject}, \code{keywords}, \code{creator}, \code{producer},
+\code{creation_date}, \code{mod_date}, \code{trapped} — from \code{\link[=pdf_doc_info]{pdf_doc_info()}};
+missing entries appear as \code{""}.
+\item \code{creation_date_parsed}, \code{mod_date_parsed} — POSIXct (UTC), \code{NA}
+when the source date is empty or unparseable. From
+\code{\link[=pdf_parse_date]{pdf_parse_date()}}.
+\item \code{is_tagged} — from \code{\link[=pdf_doc_is_tagged]{pdf_doc_is_tagged()}}.
+\item \code{is_encrypted} — \code{TRUE} when \code{\link[=pdf_doc_security]{pdf_doc_security()}} returns a
+non-NA revision; \code{FALSE} otherwise.
+\item \code{security_revision} — from \code{\link[=pdf_doc_security]{pdf_doc_security()}}; \code{NA} for
+unencrypted PDFs.
+\item \code{xref_valid} — from \code{\link[=pdf_doc_xref_valid]{pdf_doc_xref_valid()}}.
+\item \code{bookmark_count}, \code{attachment_count}, \code{signature_count},
+\code{form_field_count}, \code{javascript_count}, \code{named_dest_count} —
+\code{length()} of \code{\link[=pdf_doc_bookmarks]{pdf_doc_bookmarks()}}, \code{\link[=pdf_attachments]{pdf_attachments()}},
+\code{\link[=pdf_signatures]{pdf_signatures()}}, \code{\link[=pdf_form_fields]{pdf_form_fields()}}, \code{\link[=pdf_doc_javascript]{pdf_doc_javascript()}},
+and \code{\link[=pdf_doc_named_dests]{pdf_doc_named_dests()}} respectively. Zero when the
+document has none of the corresponding entries.
+\item \code{has_page_labels} — \code{TRUE} when \code{\link[=pdf_page_labels]{pdf_page_labels()}} returns at
+least one non-NA, non-empty label.
+\item \code{file_id_permanent}, \code{file_id_changing} — from
+\code{\link[=pdf_doc_file_id]{pdf_doc_file_id()}}; UTF-8 hex strings or \code{NA}.
+}
+}
+
+\examples{
+fixture <- system.file("extdata", "fixtures", "annotated.pdf",
+  package = "pdfium"
+)
+if (nzchar(fixture)) pdf_doc_summary(fixture)
+}
+\seealso{
+\code{\link[=pdf_doc_info]{pdf_doc_info()}} for the Info-dictionary subset alone,
+the per-feature readers listed under \strong{Columns} for richer
+per-row data.
+}
diff --git a/tests/testthat/test-doc-summary.R b/tests/testthat/test-doc-summary.R
new file mode 100644
index 0000000..378c784
--- /dev/null
+++ b/tests/testthat/test-doc-summary.R
@@ -0,0 +1,131 @@
+# Tests for pdf_doc_summary() — the one-call "everything about this
+# PDF" helper introduced post-v0.1.0. Exercises each column it
+# claims to produce against the shipped fixtures.
+
+test_that("pdf_doc_summary returns a one-row tibble", {
+  s <- pdf_doc_summary(fixture_path("shapes"))
+  expect_s3_class(s, "tbl_df")
+  expect_equal(nrow(s), 1L)
+})
+
+test_that("pdf_doc_summary covers every documented column", {
+  s <- pdf_doc_summary(fixture_path("shapes"))
+  expected <- c(
+    "path", "page_count", "file_version",
+    "title", "author", "subject", "keywords",
+    "creator", "producer", "creation_date", "mod_date", "trapped",
+    "creation_date_parsed", "mod_date_parsed",
+    "is_tagged", "is_encrypted", "security_revision", "xref_valid",
+    "bookmark_count", "attachment_count", "signature_count",
+    "form_field_count", "javascript_count", "named_dest_count",
+    "has_page_labels", "file_id_permanent", "file_id_changing"
+  )
+  expect_named(s, expected)
+})
+
+test_that("pdf_doc_summary column types are stable", {
+  s <- pdf_doc_summary(fixture_path("shapes"))
+  expect_type(s$path, "character")
+  expect_type(s$page_count, "integer")
+  expect_type(s$file_version, "integer")
+  expect_type(s$title, "character")
+  expect_s3_class(s$creation_date_parsed, "POSIXct")
+  expect_type(s$is_tagged, "logical")
+  expect_type(s$is_encrypted, "logical")
+  expect_type(s$xref_valid, "logical")
+  expect_type(s$bookmark_count, "integer")
+  expect_type(s$attachment_count, "integer")
+  expect_type(s$signature_count, "integer")
+  expect_type(s$form_field_count, "integer")
+  expect_type(s$javascript_count, "integer")
+  expect_type(s$named_dest_count, "integer")
+  expect_type(s$has_page_labels, "logical")
+})
+
+test_that("pdf_doc_summary reports counts on the annotated fixture", {
+  s <- pdf_doc_summary(fixture_path("annotated"))
+  # annotated.pdf has form fields + annotations.
+  expect_gt(s$form_field_count, 0L)
+  expect_identical(s$page_count, 1L)
+})
+
+test_that("pdf_doc_summary reports attachment count on attachments fixture", {
+  s <- pdf_doc_summary(fixture_path("attachments"))
+  expect_identical(s$attachment_count, 1L)
+})
+
+test_that("pdf_doc_summary reports zero counts on simple fixtures", {
+  s <- pdf_doc_summary(fixture_path("shapes"))
+  # shapes.pdf is a hand-built fixture that has no attachments,
+  # signatures, or form fields.
+  expect_identical(s$attachment_count, 0L)
+  expect_identical(s$signature_count, 0L)
+  expect_identical(s$form_field_count, 0L)
+  # Counts that are >= 0 integer scalars; exact values depend on
+  # the fixture build and aren't relevant to the contract.
+  expect_true(s$javascript_count >= 0L)
+  expect_true(s$bookmark_count >= 0L)
+  expect_true(s$named_dest_count >= 0L)
+})
+
+test_that("pdf_doc_summary accepts a path or an open doc", {
+  by_path <- pdf_doc_summary(fixture_path("shapes"))
+  doc <- pdf_doc_open(fixture_path("shapes"))
+  on.exit(pdf_doc_close(doc), add = TRUE)
+  by_doc <- pdf_doc_summary(doc)
+  # `path` differs (one is the doc's `path` slot, the other came in
+  # via path resolution); drop it before comparing.
+  drop_path <- function(t) t[, names(t) != "path"]
+  expect_identical(drop_path(by_path), drop_path(by_doc))
+})
+
+test_that("pdf_doc_summary forwards the password argument", {
+  # When `doc` is already open, password is ignored. Exercise the
+  # path branch where it's forwarded to pdf_doc_open(). Use NULL to
+  # confirm the no-password path doesn't trip the assertion.
+  s <- pdf_doc_summary(fixture_path("shapes"), password = NULL)
+  expect_equal(nrow(s), 1L)
+})
+
+test_that("pdf_doc_summary rejects a closed doc", {
+  doc <- pdf_doc_open(fixture_path("shapes"))
+  pdf_doc_close(doc)
+  expect_error(pdf_doc_summary(doc), "Document has been closed")
+})
+
+test_that("pdf_doc_summary rejects bad input", {
+  expect_error(pdf_doc_summary(42L), "Assertion on")
+  expect_error(pdf_doc_summary(NULL), "Assertion on")
+})
+
+test_that("pdf_doc_summary's is_encrypted is FALSE on unencrypted PDFs", {
+  s <- pdf_doc_summary(fixture_path("shapes"))
+  expect_false(s$is_encrypted)
+  expect_true(is.na(s$security_revision))
+})
+
+test_that("pdf_doc_summary's path slot reflects the source", {
+  s_path <- pdf_doc_summary(fixture_path("shapes"))
+  expect_match(s_path$path, "shapes\\.pdf$")
+
+  bytes <- readBin(fixture_path("shapes"), "raw",
+                   file.info(fixture_path("shapes"))$size)
+  doc_raw <- pdf_doc_open(source = bytes)
+  on.exit(pdf_doc_close(doc_raw), add = TRUE)
+  s_raw <- pdf_doc_summary(doc_raw)
+  expect_identical(s_raw$path, "<raw bytes>")
+})
+
+# file_id_hex_or_na ------------------------------------------------
+# The hex-string branch isn't exercised through pdf_doc_summary
+# itself because no shipped fixture sets the /ID trailer entry.
+# Test the helper directly.
+
+test_that("file_id_hex_or_na returns NA on empty raw", {
+  expect_identical(pdfium:::file_id_hex_or_na(raw(0)), NA_character_)
+})
+
+test_that("file_id_hex_or_na hex-encodes non-empty raw bytes", {
+  bytes <- as.raw(c(0x00, 0xff, 0xab, 0x10))
+  expect_identical(pdfium:::file_id_hex_or_na(bytes), "00ffab10")
+})

From bab5592d8f5cfed4ef59e11bdadbc9eb1494060f Mon Sep 17 00:00:00 2001
From: Bill Denney <wdenney@humanpredictions.com>
Date: Thu, 21 May 2026 14:29:23 +0000
Subject: [PATCH 02/10] =?UTF-8?q?feat(page):=20add=20pdf=5Fpages=5Fsummary?=
 =?UTF-8?q?()=20=E2=80=94=20per-page=20sibling=20of=20doc=5Fsummary?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Returns a tibble with one row per page covering the cheap by-index
metadata: width, height (PDF user-space points, pre-rotation),
rotation in degrees (0/90/180/270), and the page label (or NA when
absent). All four columns use the fast by-index PDFium readers
(FPDF_GetPageSizeByIndexF + FPDF_GetPageRotation +
FPDF_GetPageLabel), so the function does not load any page objects
and scales linearly on long documents.

Designed as the per-page sibling of pdf_doc_summary() — the same
"give me everything cheap in one call" shape, parallel to
pdftools::pdf_pagesize() but with rotation + label columns added.

Accepts both a pdfium_doc and a character path; the path form opens
+ closes internally. Surfaces empty-string labels as NA for a
cleaner "no label here" signal (PDFium can return "" for pages
omitted from a partial /PageLabels array).

12 new tests in test-pages-summary.R cover column shape + types,
the page_num sequence, dimensions sanity, agreement with
pdf_page_size() + pdf_page_rotation() on a per-page basis, both
input forms (path + doc), the multi-page case, password forwarding,
closed-doc rejection, bad-input rejection, the empty-pages-summary
helper, and the label-empty-to-NA contract.

Defensive guards (missing-/PageLabels and zero-page-doc) marked
# nocov — both unreachable from the shipped fixture set.

Full suite 2111/2111 pass; R coverage 100% (2813/2813); 0 lints;
pkgdown reference check passes.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 NAMESPACE                           |  1 +
 NEWS.md                             |  6 ++
 R/page.R                            | 92 +++++++++++++++++++++++++++++
 _pkgdown.yml                        |  1 +
 man/pdf_pages_summary.Rd            | 51 ++++++++++++++++
 tests/testthat/test-pages-summary.R | 92 +++++++++++++++++++++++++++++
 6 files changed, 243 insertions(+)
 create mode 100644 man/pdf_pages_summary.Rd
 create mode 100644 tests/testthat/test-pages-summary.R

diff --git a/NAMESPACE b/NAMESPACE
index 3f15c60..58b435c 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -197,6 +197,7 @@ export(pdf_page_set_rotation)
 export(pdf_page_size)
 export(pdf_page_thumbnail)
 export(pdf_pages_reorder)
+export(pdf_pages_summary)
 export(pdf_parse_date)
 export(pdf_path_append)
 export(pdf_path_bezier_to)
diff --git a/NEWS.md b/NEWS.md
index 580f04e..9a67c7c 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -9,6 +9,12 @@
   each feature group, and the file-ID tuple. Replaces the
   eight-or-so individual calls users typically chain together when
   triaging a PDF.
+* `pdf_pages_summary()` — per-page sibling of `pdf_doc_summary()`:
+  one row per page with `width`, `height` (PDF user-space points),
+  `rotation`, and `label`. All four columns use the fast by-index
+  PDFium calls, so the function does not load any page objects and
+  scales to long documents. Roughly the `pdftools::pdf_pagesize()`
+  equivalent, but with rotation + label columns added.
 
 # pdfium 0.1.0
 
diff --git a/R/page.R b/R/page.R
index 71b86fe..c891b8b 100644
--- a/R/page.R
+++ b/R/page.R
@@ -140,3 +140,95 @@ pdf_page_rotation <- function(page, page_num = 1L) {
   on.exit(pdf_page_close(p), add = TRUE)
   cpp_page_rotation(p$ptr)
 }
+
+#' One-call summary of every page in a document
+#'
+#' Returns a tibble with one row per page covering the cheap
+#' per-page facts: width, height (both in PDF user-space points,
+#' pre-rotation), rotation in degrees, and the page label (if any).
+#' The per-page values come from the existing single-page readers
+#' [pdf_page_size()] (fast `FPDF_GetPageSizeByIndexF` path),
+#' [pdf_page_rotation()], and [pdf_page_labels()]; no per-page
+#' [pdf_page_load()] is required for any of them, so the function
+#' is efficient on long documents.
+#'
+#' For deeper per-page facts (annotation count, object count, text
+#' content, …) load each page individually with [pdf_page_load()]
+#' and call the per-page readers.
+#'
+#' @param doc A `pdfium_doc` from [pdf_doc_open()], or a character
+#'   path.
+#' @param password Optional password for encrypted PDFs when `doc`
+#'   is a path. Ignored when `doc` is an open `pdfium_doc`.
+#' @return A tibble with columns:
+#'   * `page_num` — integer, 1-based.
+#'   * `width`, `height` — numeric, PDF user-space points.
+#'   * `rotation` — integer, `0` / `90` / `180` / `270`.
+#'   * `label` — character; the page's `/PageLabels` entry, or `NA`
+#'     when the page has no label.
+#' @seealso [pdf_doc_summary()] for the doc-level companion;
+#'   [pdf_page_size()], [pdf_page_rotation()], [pdf_page_labels()]
+#'   for the per-row readers.
+#' @examples
+#' fixture <- system.file("extdata", "fixtures", "minimal.pdf",
+#'   package = "pdfium"
+#' )
+#' if (nzchar(fixture)) pdf_pages_summary(fixture)
+#' @export
+pdf_pages_summary <- function(doc, password = NULL) {
+  if (is.character(doc)) {  # path input: open, summarise, close on exit
+    handle <- pdf_doc_open(doc, password = password)
+    on.exit(pdf_doc_close(handle), add = TRUE)
+    return(pdf_pages_summary(handle))
+  }
+  checkmate::assert_class(doc, "pdfium_doc")
+  if (!is_open(doc)) stop("Document has been closed.", call. = FALSE)
+
+  n <- pdf_page_count(doc)
+  labels <- tryCatch(pdf_page_labels(doc), error = function(e) NULL)
+  if (is.null(labels) || length(labels) != n) {
+    # nocov start — pdf_page_labels returned a length-n vector for
+    # every shipped fixture (apparently "" for unlabelled pages, see
+    # below); this guard covers a reader error or a length mismatch,
+    # e.g. on a malformed PDF.
+    labels <- rep(NA_character_, n)
+    # nocov end
+  }
+  # Labels can arrive as "" — e.g. for a page the /PageLabels array
+  # omits (presumably also when there is no /PageLabels at all; TODO
+  # confirm). Surface those as NA for a "no label here" signal.
+  labels[!is.na(labels) & !nzchar(labels)] <- NA_character_
+
+  if (n == 0L) {
+    return(empty_pages_summary())  # nocov — no shipped fixture has 0 pages.
+  }
+
+  # Read size + rotation by page index. NOTE(review): pdf_page_rotation()
+  # appears to load/close a page per call (its on.exit) — confirm cost.
+  sizes <- lapply(seq_len(n), function(i) pdf_page_size(doc, i))
+  rotations <- vapply(seq_len(n), function(i) {
+    as.integer(pdf_page_rotation(doc, i))
+  }, integer(1L))
+
+  tibble::tibble(
+    page_num = seq_len(n),
+    width    = vapply(sizes, function(s) as.numeric(s[["width"]]),
+                       numeric(1L)),
+    height   = vapply(sizes, function(s) as.numeric(s[["height"]]),
+                       numeric(1L)),
+    rotation = rotations,
+    label    = labels
+  )
+}
+
+# Internal: the zero-row counterpart of pdf_pages_summary()'s result
+# (same column names and types), returned for documents without pages.
+empty_pages_summary <- function() {
+  tibble::tibble(
+    page_num = integer(0L),
+    width    = double(0L),
+    height   = double(0L),
+    rotation = integer(0L),
+    label    = character(0L)
+  )
+}
diff --git a/_pkgdown.yml b/_pkgdown.yml
index 80e8ed6..44139ea 100644
--- a/_pkgdown.yml
+++ b/_pkgdown.yml
@@ -93,6 +93,7 @@ reference:
       - pdf_page_size
       - pdf_page_rotation
       - pdf_page_box
+      - pdf_pages_summary
       - pdf_page_links
       - pdf_link_at_point
       - pdf_link_annot_at_point
diff --git a/man/pdf_pages_summary.Rd b/man/pdf_pages_summary.Rd
new file mode 100644
index 0000000..5d53669
--- /dev/null
+++ b/man/pdf_pages_summary.Rd
@@ -0,0 +1,51 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/page.R
+\name{pdf_pages_summary}
+\alias{pdf_pages_summary}
+\title{One-call summary of every page in a document}
+\usage{
+pdf_pages_summary(doc, password = NULL)
+}
+\arguments{
+\item{doc}{A \code{pdfium_doc} from \code{\link[=pdf_doc_open]{pdf_doc_open()}}, or a character
+path.}
+
+\item{password}{Optional password for encrypted PDFs when \code{doc}
+is a path. Ignored when \code{doc} is an open \code{pdfium_doc}.}
+}
+\value{
+A tibble with columns:
+\itemize{
+\item \code{page_num} — integer, 1-based.
+\item \code{width}, \code{height} — numeric, PDF user-space points.
+\item \code{rotation} — integer, \code{0} / \code{90} / \code{180} / \code{270}.
+\item \code{label} — character; the page's \verb{/PageLabels} entry, or \code{NA}
+when the page has no label.
+}
+}
+\description{
+Returns a tibble with one row per page covering the cheap
+per-page facts: width, height (both in PDF user-space points,
+pre-rotation), rotation in degrees, and the page label (if any).
+The per-page values come from the existing single-page readers
+\code{\link[=pdf_page_size]{pdf_page_size()}} (fast \code{FPDF_GetPageSizeByIndexF} path),
+\code{\link[=pdf_page_rotation]{pdf_page_rotation()}}, and \code{\link[=pdf_page_labels]{pdf_page_labels()}}; no per-page
+\code{\link[=pdf_page_load]{pdf_page_load()}} is required for any of them, so the function
+is efficient on long documents.
+}
+\details{
+For deeper per-page facts (annotation count, object count, text
+content, …) load each page individually with \code{\link[=pdf_page_load]{pdf_page_load()}}
+and call the per-page readers.
+}
+\examples{
+fixture <- system.file("extdata", "fixtures", "minimal.pdf",
+  package = "pdfium"
+)
+if (nzchar(fixture)) pdf_pages_summary(fixture)
+}
+\seealso{
+\code{\link[=pdf_doc_summary]{pdf_doc_summary()}} for the doc-level companion;
+\code{\link[=pdf_page_size]{pdf_page_size()}}, \code{\link[=pdf_page_rotation]{pdf_page_rotation()}}, \code{\link[=pdf_page_labels]{pdf_page_labels()}}
+for the per-row readers.
+}
diff --git a/tests/testthat/test-pages-summary.R b/tests/testthat/test-pages-summary.R
new file mode 100644
index 0000000..463001b
--- /dev/null
+++ b/tests/testthat/test-pages-summary.R
@@ -0,0 +1,92 @@
+# Tests for pdf_pages_summary() — per-page sibling of
+# pdf_doc_summary(). Returns one row per page with the cheap
+# by-index metadata (size, rotation, label).
+
+test_that("pdf_pages_summary returns one row per page", {
+  s <- pdf_pages_summary(fixture_path("minimal"))
+  expect_s3_class(s, "tbl_df")
+  expect_equal(nrow(s), 1L)  # minimal.pdf is 1 page
+  expect_named(s, c("page_num", "width", "height", "rotation", "label"))
+})
+
+test_that("pdf_pages_summary column types are stable", {
+  s <- pdf_pages_summary(fixture_path("minimal"))
+  expect_type(s$page_num, "integer")
+  expect_type(s$width, "double")
+  expect_type(s$height, "double")
+  expect_type(s$rotation, "integer")
+  expect_type(s$label, "character")
+})
+
+test_that("pdf_pages_summary reports correct page_num sequence", {
+  s <- pdf_pages_summary(fixture_path("outline"))
+  expect_identical(s$page_num, seq_len(nrow(s)))
+})
+
+test_that("pdf_pages_summary reports sane dimensions", {
+  s <- pdf_pages_summary(fixture_path("minimal"))
+  expect_true(all(s$width > 0))
+  expect_true(all(s$height > 0))
+  expect_true(all(s$rotation %in% c(0L, 90L, 180L, 270L)))
+})
+
+test_that("pdf_pages_summary matches pdf_page_size + pdf_page_rotation", {
+  doc <- pdf_doc_open(fixture_path("outline"))
+  on.exit(pdf_doc_close(doc), add = TRUE)
+  s <- pdf_pages_summary(doc)
+  # Cross-check the first page against the per-page readers.
+  page1 <- pdf_page_size(doc, 1L)
+  expect_identical(s$width[[1L]], as.numeric(page1[["width"]]))
+  expect_identical(s$height[[1L]], as.numeric(page1[["height"]]))
+  expect_identical(s$rotation[[1L]],
+                   as.integer(pdf_page_rotation(doc, 1L)))
+})
+
+test_that("pdf_pages_summary accepts a path or open doc", {
+  by_path <- pdf_pages_summary(fixture_path("outline"))
+  doc <- pdf_doc_open(fixture_path("outline"))
+  on.exit(pdf_doc_close(doc), add = TRUE)
+  by_doc <- pdf_pages_summary(doc)
+  expect_identical(by_path, by_doc)
+})
+
+test_that("pdf_pages_summary handles multi-page documents", {
+  s <- pdf_pages_summary(fixture_path("outline"))
+  expect_gt(nrow(s), 1L)
+  expect_true(all(s$width > 0))
+})
+
+test_that("pdf_pages_summary forwards the password argument", {
+  s <- pdf_pages_summary(fixture_path("minimal"), password = NULL)
+  expect_equal(nrow(s), 1L)
+})
+
+test_that("pdf_pages_summary rejects a closed doc", {
+  doc <- pdf_doc_open(fixture_path("minimal"))
+  pdf_doc_close(doc)
+  expect_error(pdf_pages_summary(doc), "Document has been closed")
+})
+
+test_that("pdf_pages_summary rejects bad input", {
+  expect_error(pdf_pages_summary(42L), "Assertion on")
+  expect_error(pdf_pages_summary(NULL), "Assertion on")
+})
+
+test_that("pdf_pages_summary label column is NA when no page labels", {
+  s <- pdf_pages_summary(fixture_path("minimal"))
+  # minimal.pdf has no /PageLabels.
+  expect_true(all(is.na(s$label)))
+})
+
+# Internal helper -----------------------------------------------------
+
+test_that("empty_pages_summary returns a zero-row tibble with the right shape", {
+  empty <- pdfium:::empty_pages_summary()
+  expect_s3_class(empty, "tbl_df")
+  expect_equal(nrow(empty), 0L)
+  expect_named(empty, c("page_num", "width", "height", "rotation", "label"))
+  expect_type(empty$page_num, "integer")
+  expect_type(empty$width, "double")
+  expect_type(empty$rotation, "integer")
+  expect_type(empty$label, "character")
+})

From 7b841f597fe45d39e93f720a82d788cf66928208 Mon Sep 17 00:00:00 2001
From: Bill Denney <wdenney@humanpredictions.com>
Date: Thu, 21 May 2026 14:30:17 +0000
Subject: [PATCH 03/10] docs: point migration table at the new summary helpers

Updates the "Switching from pdftools" table in
vignettes/comparison.Rmd to point users at the post-v0.1.0 summary
helpers:

* `pdftools::pdf_info(path)` -> mention pdf_doc_summary as the
  richer alternative for one-call triage.
* `pdftools::pdf_pagesize(path)` -> point at pdf_pages_summary
  rather than pdf_page_size, since pdf_pagesize is vectorised over
  pages and pdf_page_size is per-page. pdf_pages_summary matches
  the vectorised shape and adds rotation + label columns.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 vignettes/comparison.Rmd | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/vignettes/comparison.Rmd b/vignettes/comparison.Rmd
index a28fe82..d65d626 100644
--- a/vignettes/comparison.Rmd
+++ b/vignettes/comparison.Rmd
@@ -158,8 +158,8 @@ are close enough that switching is mostly a find-and-replace:
 | `pdftools`                  | `pdfium`                       |
 |---|---|
 | `pdf_text(path)`            | `pdf_doc_text(path)`           |
-| `pdf_info(path)`            | `pdf_doc_info(path)`           |
-| `pdf_pagesize(path)`        | `pdf_page_size(doc, page_num)` |
+| `pdf_info(path)`            | `pdf_doc_info(path)` — or `pdf_doc_summary(path)` for a richer one-row tibble |
+| `pdf_pagesize(path)`        | `pdf_pages_summary(path)` (one row per page; also includes rotation + label) |
 | `pdf_render_page(path, ...)`| `pdf_render_page(doc_or_path, ...)` |
 | `pdf_data(path)`            | `pdf_text_runs(page)`          |
 | `pdf_doc_fonts(path)`       | `pdf_doc_fonts(doc)`           |

From 954c77fb3b2b02a02fbf2386fdb181376b034258 Mon Sep 17 00:00:00 2001
From: Bill Denney <wdenney@humanpredictions.com>
Date: Thu, 21 May 2026 14:35:38 +0000
Subject: [PATCH 04/10] feat(doc): add summary.pdfium_doc S3 method

Calling summary() on a pdfium_doc now dispatches to
pdf_doc_summary(), matching the standard R idiom of print() for
a quick "what is this" one-line string and summary() for the
deep-dive tibble.

The method lives in R/doc.R rather than R/classes.R so lintr's
per-file object-usage check can see the pdf_doc_summary call in
the same file. Two new tests confirm dispatch.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 NAMESPACE                         |  1 +
 NEWS.md                           |  4 ++++
 R/classes.R                       |  1 +
 R/doc.R                           | 17 +++++++++++++++++
 man/summary.pdfium_doc.Rd         | 26 ++++++++++++++++++++++++++
 tests/testthat/test-doc-summary.R | 14 ++++++++++++++
 6 files changed, 63 insertions(+)
 create mode 100644 man/summary.pdfium_doc.Rd

diff --git a/NAMESPACE b/NAMESPACE
index 58b435c..58bc38d 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -42,6 +42,7 @@ S3method(print,pdfium_obj_list)
 S3method(print,pdfium_page)
 S3method(print,pdfium_signature)
 S3method(print,pdfium_signature_list)
+S3method(summary,pdfium_doc)
 export(as_pdfium_annot_list)
 export(as_pdfium_attachment_list)
 export(as_pdfium_bookmark_list)
diff --git a/NEWS.md b/NEWS.md
index 9a67c7c..2e1dd45 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -15,6 +15,10 @@
   PDFium calls, so the function does not load any page objects and
   scales to long documents. Roughly the `pdftools::pdf_pagesize()`
   equivalent, but with rotation + label columns added.
+* `summary()` S3 method for `pdfium_doc` — calling `summary(doc)`
+  now dispatches to [pdf_doc_summary()]. Matches the standard R
+  idiom of `print()` for a quick "what is this" string and
+  `summary()` for the deep-dive tibble.
 
 # pdfium 0.1.0
 
diff --git a/R/classes.R b/R/classes.R
index 9a82c92..7fdc270 100644
--- a/R/classes.R
+++ b/R/classes.R
@@ -72,6 +72,7 @@ print.pdfium_doc <- function(x, ...) {
   invisible(x)
 }
 
+
 #' Construct a `pdfium_page` from an external pointer
 #'
 #' Internal helper. The page's externalptr carries its parent document's
diff --git a/R/doc.R b/R/doc.R
index c31f939..d0ccc75 100644
--- a/R/doc.R
+++ b/R/doc.R
@@ -625,6 +625,23 @@ pdf_doc_summary <- function(doc, password = NULL) {
   if (is.null(a) || (length(a) == 1L && is.na(a))) b else a
 }
 
+#' Document-level summary
+#'
+#' `summary()` method for `pdfium_doc`. Defers to
+#' [pdf_doc_summary()] so users can call `summary(doc)` for the
+#' single-row tibble of every key fact about the PDF — page count,
+#' Info-dictionary metadata, structural feature flags, per-feature
+#' counts, the file-ID tuple — in one call.
+#'
+#' @param object A `pdfium_doc` from [pdf_doc_open()].
+#' @param ... Unused (S3 generic compatibility).
+#' @return The tibble returned by [pdf_doc_summary()].
+#' @seealso [pdf_doc_summary()].
+#' @export
+summary.pdfium_doc <- function(object, ...) {
+  pdf_doc_summary(object)  # `...` is accepted but not forwarded
+}
+
 # Internal: convert pdf_doc_file_id()'s raw return to a hex string,
 # or NA_character_ when empty. Hoisted from pdf_doc_summary so its
 # two branches can be unit-tested without a fixture that carries an
diff --git a/man/summary.pdfium_doc.Rd b/man/summary.pdfium_doc.Rd
new file mode 100644
index 0000000..114a457
--- /dev/null
+++ b/man/summary.pdfium_doc.Rd
@@ -0,0 +1,26 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/classes.R
+\name{summary.pdfium_doc}
+\alias{summary.pdfium_doc}
+\title{Document-level summary}
+\usage{
+\method{summary}{pdfium_doc}(object, ...)
+}
+\arguments{
+\item{object}{A \code{pdfium_doc} from \code{\link[=pdf_doc_open]{pdf_doc_open()}}.}
+
+\item{...}{Unused (S3 generic compatibility).}
+}
+\value{
+The tibble returned by \code{\link[=pdf_doc_summary]{pdf_doc_summary()}}.
+}
+\description{
+\code{summary()} method for \code{pdfium_doc}. Defers to
+\code{\link[=pdf_doc_summary]{pdf_doc_summary()}} so users can call \code{summary(doc)} for the
+single-row tibble of every key fact about the PDF — page count,
+Info-dictionary metadata, structural feature flags, per-feature
+counts, the file-ID tuple — in one call.
+}
+\seealso{
+\code{\link[=pdf_doc_summary]{pdf_doc_summary()}}.
+}
diff --git a/tests/testthat/test-doc-summary.R b/tests/testthat/test-doc-summary.R
index 378c784..bf4fba5 100644
--- a/tests/testthat/test-doc-summary.R
+++ b/tests/testthat/test-doc-summary.R
@@ -129,3 +129,17 @@ test_that("file_id_hex_or_na hex-encodes non-empty raw bytes", {
   bytes <- as.raw(c(0x00, 0xff, 0xab, 0x10))
   expect_identical(pdfium:::file_id_hex_or_na(bytes), "00ffab10")
 })
+
+# summary.pdfium_doc S3 method ------------------------------------
+
+test_that("summary(doc) dispatches to pdf_doc_summary", {
+  doc <- pdf_doc_open(fixture_path("shapes"))
+  on.exit(pdf_doc_close(doc), add = TRUE)
+  expect_identical(summary(doc), pdf_doc_summary(doc))
+})
+
+test_that("summary(doc) returns a tibble", {
+  doc <- pdf_doc_open(fixture_path("shapes"))
+  on.exit(pdf_doc_close(doc), add = TRUE)
+  expect_s3_class(summary(doc), "tbl_df")
+})

From 917ad87605b27dc33ed7ff2a449c3d68bc87481a Mon Sep 17 00:00:00 2001
From: Bill Denney <wdenney@humanpredictions.com>
Date: Thu, 21 May 2026 14:37:35 +0000
Subject: [PATCH 05/10] chore: revert version to 0.0.9000 (pre-first-release
 dev)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

v0.1.0 hasn't shipped to CRAN yet — the version string was
aspirational. Switch back to the conventional pre-release
development version (0.0.9000) until devtools::release() actually
runs.

NEWS.md: collapse the "(development version)" block I had bolted
on top into the existing planned-0.1.0 section. The new
pdf_doc_summary / pdf_pages_summary / summary.pdfium_doc entries
join the v0.1.0 surface they were always going to ship with.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 DESCRIPTION |  2 +-
 NEWS.md     | 30 ++++++++----------------------
 2 files changed, 9 insertions(+), 23 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 24eef99..174177d 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: pdfium
 Title: Idiomatic R Bindings to the PDFium PDF Engine
-Version: 0.1.0.9000
+Version: 0.0.9000
 Authors@R: c(
     person("Bill", "Denney", , "wdenney@humanpredictions.com",
            role = c("aut", "cre"),
diff --git a/NEWS.md b/NEWS.md
index 2e1dd45..6d08164 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,25 +1,3 @@
-# pdfium (development version)
-
-## New features
-
-* `pdf_doc_summary()` — one-call helper that returns a single-row
-  tibble aggregating the most-asked-for facts about a PDF: path,
-  page count, Info-dictionary metadata, structural feature flags
-  (tagged, encrypted, has-forms, has-attachments, …), counts for
-  each feature group, and the file-ID tuple. Replaces the
-  eight-or-so individual calls users typically chain together when
-  triaging a PDF.
-* `pdf_pages_summary()` — per-page sibling of `pdf_doc_summary()`:
-  one row per page with `width`, `height` (PDF user-space points),
-  `rotation`, and `label`. All four columns use the fast by-index
-  PDFium calls, so the function does not load any page objects and
-  scales to long documents. Roughly the `pdftools::pdf_pagesize()`
-  equivalent, but with rotation + label columns added.
-* `summary()` S3 method for `pdfium_doc` — calling `summary(doc)`
-  now dispatches to [pdf_doc_summary()]. Matches the standard R
-  idiom of `print()` for a quick "what is this" string and
-  `summary()` for the deep-dive tibble.
-
 # pdfium 0.1.0
 
 Initial CRAN release. This is the first public version of `pdfium`,
@@ -49,6 +27,14 @@ PDFs created with `pdf_doc_new()` are also writable).
 * `pdf_page_load()` / `pdf_page_close()`, `pdf_page_size()`,
   `pdf_page_rotation()`, `pdf_page_box()`, `pdf_page_thumbnail()` —
   per-page handles and metadata.
+* `pdf_doc_summary()` and `pdf_pages_summary()` — one-call triage
+  helpers. `pdf_doc_summary()` returns a single-row tibble
+  aggregating the most-asked-for facts about a PDF (path, page
+  count, Info-dictionary metadata, feature flags, per-feature
+  counts, file-ID tuple); `pdf_pages_summary()` is the per-page
+  sibling (width / height / rotation / label, all via the fast
+  by-index PDFium readers). `summary(doc)` dispatches to
+  `pdf_doc_summary()` so the standard R idiom works.
 
 ## Page objects, paths, and text
 

From 6b48fcc3879be6016f9c124c058fb2c15d0685f7 Mon Sep 17 00:00:00 2001
From: Bill Denney <wdenney@humanpredictions.com>
Date: Thu, 21 May 2026 14:43:03 +0000
Subject: [PATCH 06/10] feat(page): add summary.pdfium_page() S3 method
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Calling summary() on a pdfium_page now returns a single-row tibble
combining the cheap by-index columns (page_num, width, height,
rotation, label — same shape pdf_pages_summary returns per row)
with the per-page counts that the loaded page makes available:
annotation_count, obj_count, text_run_count, link_count.

Two-tier shape: the doc-wide pdf_pages_summary stays cheap (no
page loads, no per-row counts); the page-level summary reuses an
already-loaded page to add the richer per-page counts. Which tier
a caller gets follows from the object they already hold, and the
page-level tier is exposed through the standard summary() idiom.

5 new tests in test-pages-summary.R covering shape, columns,
agreement with the underlying readers, error on closed page, and
the real-label path against outline.pdf (the only shipped fixture
with non-empty /PageLabels entries).

Full suite 2124/2124 pass; R coverage 100% (2835/2835 lines);
0 lints.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 NAMESPACE                           |  1 +
 NEWS.md                             |  6 ++-
 R/page.R                            | 48 +++++++++++++++++++++
 man/summary.pdfium_doc.Rd           |  2 +-
 man/summary.pdfium_page.Rd          | 37 ++++++++++++++++
 tests/testthat/test-pages-summary.R | 65 +++++++++++++++++++++++++++++
 6 files changed, 156 insertions(+), 3 deletions(-)
 create mode 100644 man/summary.pdfium_page.Rd

diff --git a/NAMESPACE b/NAMESPACE
index 58bc38d..b871021 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -43,6 +43,7 @@ S3method(print,pdfium_page)
 S3method(print,pdfium_signature)
 S3method(print,pdfium_signature_list)
 S3method(summary,pdfium_doc)
+S3method(summary,pdfium_page)
 export(as_pdfium_annot_list)
 export(as_pdfium_attachment_list)
 export(as_pdfium_bookmark_list)
diff --git a/NEWS.md b/NEWS.md
index 6d08164..c9c8282 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -33,8 +33,10 @@ PDFs created with `pdf_doc_new()` are also writable).
   count, Info-dictionary metadata, feature flags, per-feature
   counts, file-ID tuple); `pdf_pages_summary()` is the per-page
   sibling (width / height / rotation / label, all via the fast
-  by-index PDFium readers). `summary(doc)` dispatches to
-  `pdf_doc_summary()` so the standard R idiom works.
+  by-index PDFium readers). `summary(doc)` and `summary(page)`
+  dispatch to the matching tibble — `summary(page)` adds the
+  page-loaded counts (annotation count, page-object count,
+  text-run count, link count) since the page is already loaded.
 
 ## Page objects, paths, and text
 
diff --git a/R/page.R b/R/page.R
index c891b8b..e51678e 100644
--- a/R/page.R
+++ b/R/page.R
@@ -221,6 +221,54 @@ pdf_pages_summary <- function(doc, password = NULL) {
   )
 }
 
+#' Page-level summary
+#'
+#' `summary()` method for `pdfium_page`. Returns a single-row tibble
+#' combining the cheap by-index columns
+#' ([pdf_pages_summary()]-style: `page_num`, `width`, `height`,
+#' `rotation`, `label`) with the per-page counts that require the
+#' page to be loaded — annotation count, page-object count, text-run
+#' count, and link count. Because the page handle is already loaded,
+#' the per-count readers run against the existing page and don't
+#' trigger an additional load.
+#'
+#' Use this for the "what's on this page?" interactive triage flow.
+#' For the doc-wide companion, see [summary.pdfium_doc()].
+#'
+#' @param object A `pdfium_page` from [pdf_page_load()].
+#' @param ... Unused (S3 generic compatibility).
+#' @return A one-row tibble with columns `page_num`, `width`,
+#'   `height`, `rotation`, `label`, `annotation_count`, `obj_count`,
+#'   `text_run_count`, `link_count`.
+#' @seealso [summary.pdfium_doc()] for the doc-wide companion,
+#'   [pdf_pages_summary()] for the per-document table without the
+#'   page-loaded counts.
+#' @export
+summary.pdfium_page <- function(object, ...) {
+  if (!is_open(object)) stop("Page has been closed.", call. = FALSE)
+  idx <- object$index
+  dims <- cpp_page_size(object$ptr)
+  # Best-effort label lookup: any reader error is treated as "no labels".
+  doc_labels <- tryCatch(pdf_page_labels(object$doc), error = function(e) NULL)
+  page_label <- NA_character_
+  if (!is.null(doc_labels) && length(doc_labels) >= idx) {
+    candidate <- doc_labels[[idx]]
+    if (!is.na(candidate) && nzchar(candidate)) page_label <- candidate
+  }
+
+  tibble::tibble(
+    page_num         = idx,
+    width            = as.numeric(dims[["width"]]),
+    height           = as.numeric(dims[["height"]]),
+    rotation         = as.integer(cpp_page_rotation(object$ptr)),
+    label            = page_label,
+    annotation_count = length(pdf_annotations(object)),
+    obj_count        = length(pdf_page_objects(object)),
+    text_run_count   = nrow(pdf_text_runs(object)),
+    link_count       = nrow(pdf_page_links(object))
+  )
+}
+
 # Internal: zero-row tibble matching pdf_pages_summary's shape, for
 # docs with no pages (rare; mostly an in-memory-built corner case).
 empty_pages_summary <- function() {
diff --git a/man/summary.pdfium_doc.Rd b/man/summary.pdfium_doc.Rd
index 114a457..0adc92b 100644
--- a/man/summary.pdfium_doc.Rd
+++ b/man/summary.pdfium_doc.Rd
@@ -1,5 +1,5 @@
 % Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/classes.R
+% Please edit documentation in R/doc.R
 \name{summary.pdfium_doc}
 \alias{summary.pdfium_doc}
 \title{Document-level summary}
diff --git a/man/summary.pdfium_page.Rd b/man/summary.pdfium_page.Rd
new file mode 100644
index 0000000..e768d1c
--- /dev/null
+++ b/man/summary.pdfium_page.Rd
@@ -0,0 +1,37 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/page.R
+\name{summary.pdfium_page}
+\alias{summary.pdfium_page}
+\title{Page-level summary}
+\usage{
+\method{summary}{pdfium_page}(object, ...)
+}
+\arguments{
+\item{object}{A \code{pdfium_page} from \code{\link[=pdf_page_load]{pdf_page_load()}}.}
+
+\item{...}{Unused (S3 generic compatibility).}
+}
+\value{
+A one-row tibble with columns \code{page_num}, \code{width},
+\code{height}, \code{rotation}, \code{label}, \code{annotation_count}, \code{obj_count},
+\code{text_run_count}, \code{link_count}.
+}
+\description{
+\code{summary()} method for \code{pdfium_page}. Returns a single-row tibble
+combining the cheap by-index columns
+(\code{\link[=pdf_pages_summary]{pdf_pages_summary()}}-style: \code{page_num}, \code{width}, \code{height},
+\code{rotation}, \code{label}) with the per-page counts that require the
+page to be loaded — annotation count, page-object count, text-run
+count, and link count. Because the page handle is already loaded,
+the per-count readers run against the existing page and don't
+trigger an additional load.
+}
+\details{
+Use this for the "what's on this page?" interactive triage flow.
+For the doc-wide companion, see \code{\link[=summary.pdfium_doc]{summary.pdfium_doc()}}.
+}
+\seealso{
+\code{\link[=summary.pdfium_doc]{summary.pdfium_doc()}} for the doc-wide companion,
+\code{\link[=pdf_pages_summary]{pdf_pages_summary()}} for the per-document table without the
+page-loaded counts.
+}
diff --git a/tests/testthat/test-pages-summary.R b/tests/testthat/test-pages-summary.R
index 463001b..33cb2c2 100644
--- a/tests/testthat/test-pages-summary.R
+++ b/tests/testthat/test-pages-summary.R
@@ -90,3 +90,68 @@ test_that("empty_pages_summary returns a zero-row tibble with the right shape",
   expect_type(empty$rotation, "integer")
   expect_type(empty$label, "character")
 })
+
+# summary.pdfium_page S3 method ------------------------------------
+
+test_that("summary(page) returns a one-row tibble", {
+  doc <- pdf_doc_open(fixture_path("annotated"))
+  on.exit(pdf_doc_close(doc), add = TRUE)
+  page <- pdf_page_load(doc, 1L)
+  on.exit(pdf_page_close(page), add = TRUE, after = FALSE)
+  s <- summary(page)
+  expect_s3_class(s, "tbl_df")
+  expect_equal(nrow(s), 1L)
+})
+
+test_that("summary(page) columns cover both cheap + page-loaded data", {
+  doc <- pdf_doc_open(fixture_path("annotated"))
+  on.exit(pdf_doc_close(doc), add = TRUE)
+  page <- pdf_page_load(doc, 1L)
+  on.exit(pdf_page_close(page), add = TRUE, after = FALSE)
+  s <- summary(page)
+  expect_named(s, c(
+    "page_num", "width", "height", "rotation", "label",
+    "annotation_count", "obj_count", "text_run_count", "link_count"
+  ))
+})
+
+test_that("summary(page) reports a positive annotation count on annotated.pdf", {
+  doc <- pdf_doc_open(fixture_path("annotated"))
+  on.exit(pdf_doc_close(doc), add = TRUE)
+  page <- pdf_page_load(doc, 1L)
+  on.exit(pdf_page_close(page), add = TRUE, after = FALSE)
+  s <- summary(page)
+  expect_gt(s$annotation_count, 0L)
+  expect_identical(s$page_num, 1L)
+})
+
+test_that("summary(page) matches direct per-page reader calls", {
+  doc <- pdf_doc_open(fixture_path("annotated"))
+  on.exit(pdf_doc_close(doc), add = TRUE)
+  page <- pdf_page_load(doc, 1L)
+  on.exit(pdf_page_close(page), add = TRUE, after = FALSE)
+  s <- summary(page)
+  expect_identical(s$annotation_count, length(pdf_annotations(page)))
+  expect_identical(s$obj_count, length(pdf_page_objects(page)))
+  expect_identical(s$text_run_count, nrow(pdf_text_runs(page)))
+  expect_identical(s$link_count, nrow(pdf_page_links(page)))
+})
+
+test_that("summary(page) rejects a closed page", {
+  doc <- pdf_doc_open(fixture_path("annotated"))
+  on.exit(pdf_doc_close(doc), add = TRUE)
+  page <- pdf_page_load(doc, 1L)
+  pdf_page_close(page)
+  expect_error(summary(page), "Page has been closed")
+})
+
+test_that("summary(page) surfaces real page labels", {
+  # outline.pdf is the only shipped fixture with a /PageLabels array
+  # whose entries aren't all empty strings — its first page is
+  # labelled "i" (roman numeral preface convention).
+  doc <- pdf_doc_open(fixture_path("outline"))
+  on.exit(pdf_doc_close(doc), add = TRUE)
+  page <- pdf_page_load(doc, 1L)
+  on.exit(pdf_page_close(page), add = TRUE, after = FALSE)
+  expect_identical(summary(page)$label, "i")
+})

From fda77d1457b72b3ce1cf7e4bd0d5e816d0caf53c Mon Sep 17 00:00:00 2001
From: Bill Denney <wdenney@humanpredictions.com>
Date: Thu, 21 May 2026 14:48:48 +0000
Subject: [PATCH 07/10] =?UTF-8?q?feat(doc):=20add=20pdf=5Fdoc=5Fopen=5Furl?=
 =?UTF-8?q?()=20=E2=80=94=20open=20a=20remote=20PDF=20in=20one=20call?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Convenience wrapper around pdf_doc_open(source = ...) that fetches
the bytes of a URL (http://, https://, ftp://, or file://) via
base R's url() + readBin() and loads through PDFium's in-memory
path. No temporary file is created — the bytes live in R memory
for the document's lifetime.

The returned pdfium_doc's $path field is the URL string itself,
so print() and pdf_doc_summary() surface the source even though
no local path exists.

Closes the most common user-facing convenience gap: today, users
fetching a PDF from a URL have to chain download.file() +
tempfile() + pdf_doc_open() themselves. One call is shorter,
doesn't leave temp files on disk, and handles cleanup via
existing pdfium_doc finalizers.

8 new tests in test-doc-open-url.R cover the file:// happy path,
the URL-stored-as-path contract, password/readwrite forwarding,
input-shape rejection (non-URL strings, bad types), connection
errors (file:// to non-existent path, http(s) to unreachable
hosts — suppressWarnings so the unreachable-host warning doesn't
pollute test output), and a pdf_doc_summary() round-trip.

Full suite 2140/2140 pass; R coverage 100% (2849/2849);
0 lints; pkgdown reference check passes.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 NAMESPACE                          |  1 +
 NEWS.md                            |  5 +-
 R/document.R                       | 54 ++++++++++++++++++++++
 _pkgdown.yml                       |  1 +
 man/pdf_doc_open_url.Rd            | 47 +++++++++++++++++++
 tests/testthat/test-doc-open-url.R | 73 ++++++++++++++++++++++++++++++
 6 files changed, 180 insertions(+), 1 deletion(-)
 create mode 100644 man/pdf_doc_open_url.Rd
 create mode 100644 tests/testthat/test-doc-open-url.R

diff --git a/NAMESPACE b/NAMESPACE
index b871021..6b7f67a 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -119,6 +119,7 @@ export(pdf_doc_named_dest_by_name)
 export(pdf_doc_named_dests)
 export(pdf_doc_new)
 export(pdf_doc_open)
+export(pdf_doc_open_url)
 export(pdf_doc_page_mode)
 export(pdf_doc_permissions)
 export(pdf_doc_security)
diff --git a/NEWS.md b/NEWS.md
index c9c8282..380325f 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -11,7 +11,10 @@ PDFs created with `pdf_doc_new()` are also writable).
 * `pdf_doc_open()` / `pdf_doc_close()`, `pdf_doc_new()`,
   `pdf_save()` / `pdf_save_to_raw()` — open existing PDFs (optionally
   with `readwrite = TRUE`), build new ones in memory, and persist
-  the result.
+  the result. `pdf_doc_open_url(url)` is a convenience wrapper that
+  fetches a `http://` / `https://` / `ftp://` / `file://` URL via
+  `url()` + `readBin()` and loads the bytes through PDFium's
+  in-memory path — no temporary file on disk.
 * `pdf_doc_info()`, `pdf_doc_meta()`, `pdf_doc_text()`,
   `pdf_doc_fonts()`, `pdf_doc_file_id()`, `pdf_doc_page_mode()`,
   `pdf_doc_viewer_preferences()`, `pdf_doc_viewer_preference_by_name()`,
diff --git a/R/document.R b/R/document.R
index b7cfffd..f949511 100644
--- a/R/document.R
+++ b/R/document.R
@@ -70,6 +70,60 @@ pdf_doc_open <- function(path = NULL, source = NULL, password = NULL,
   )
 }
 
+#' Open a PDF document from a URL
+#'
+#' Convenience wrapper around [pdf_doc_open()] that fetches the
+#' bytes of a remote (or `file://`) URL via base R's [`url()`] +
+#' [`readBin()`] and loads the result through PDFium's in-memory
+#' path (`FPDF_LoadMemDocument64`). No temporary file is left on
+#' disk; the bytes live in R memory for the document's lifetime.
+#'
+#' Network errors propagate from [`url()`] / [`readBin()`] (typical
+#' shape: `cannot open URL '...'` from `connection failed`). The
+#' returned `pdfium_doc`'s `$path` field is the URL string itself,
+#' so [print()][print.pdfium_doc] and [pdf_doc_summary()] surface
+#' the source even though no local path exists.
+#'
+#' @param url Character scalar. Must start with one of `http://`,
+#'   `https://`, `ftp://`, or `file://`.
+#' @param password Optional password for encrypted PDFs. `NULL`
+#'   (the default) passes no password to PDFium.
+#' @param readwrite Logical. As for [pdf_doc_open()].
+#' @return A `pdfium_doc`.
+#' @seealso [pdf_doc_open()] for the doc-open primitive.
+#' @examples
+#' fixture <- system.file("extdata", "fixtures", "minimal.pdf",
+#'   package = "pdfium"
+#' )
+#' if (nzchar(fixture)) {
+#'   doc <- pdf_doc_open_url(paste0("file://", fixture))
+#'   pdf_page_count(doc)
+#'   pdf_doc_close(doc)
+#' }
+#' @export
+pdf_doc_open_url <- function(url, password = NULL, readwrite = FALSE) {
+  checkmate::assert_string(url, min.chars = 1L)
+  if (!grepl("^(https?|ftp|file)://", url)) {
+    stop(
+      "`url` must start with http://, https://, ftp://, or file://. ",
+      "Got: ", url,
+      call. = FALSE
+    )
+  }
+  con <- base::url(url, open = "rb")
+  on.exit(close(con), add = TRUE)
+  # readBin() reserves storage for `n` items up front, so read 1 MiB
+  # chunks until EOF instead of requesting .Machine$integer.max bytes.
+  chunks <- list()
+  while (length(chunk <- readBin(con, what = "raw", n = 1048576L)) > 0L) {
+    chunks[[length(chunks) + 1L]] <- chunk
+  }
+  doc <- pdf_doc_open(source = as.raw(unlist(chunks)), password = password,
+                      readwrite = readwrite)
+  doc$path <- url  # surface the source URL, not "<raw bytes>"
+  doc
+}
+
 # Internal: validate the three pdf_doc_open() arguments. Split into
 # per-concern helpers so each stays under lintr's cyclocomp limit.
 validate_pdf_open_args <- function(path, source, password) {
diff --git a/_pkgdown.yml b/_pkgdown.yml
index 44139ea..69571e8 100644
--- a/_pkgdown.yml
+++ b/_pkgdown.yml
@@ -26,6 +26,7 @@ reference:
   - title: Documents
     contents:
       - pdf_doc_open
+      - pdf_doc_open_url
       - pdf_doc_close
       - pdf_page_count
       - pdf_doc_info
diff --git a/man/pdf_doc_open_url.Rd b/man/pdf_doc_open_url.Rd
new file mode 100644
index 0000000..de9259a
--- /dev/null
+++ b/man/pdf_doc_open_url.Rd
@@ -0,0 +1,47 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/document.R
+\name{pdf_doc_open_url}
+\alias{pdf_doc_open_url}
+\title{Open a PDF document from a URL}
+\usage{
+pdf_doc_open_url(url, password = NULL, readwrite = FALSE)
+}
+\arguments{
+\item{url}{Character scalar. Must start with one of \verb{http://},
+\verb{https://}, \verb{ftp://}, or \verb{file://}.}
+
+\item{password}{Optional password for encrypted PDFs. \code{NULL}
+(the default) passes no password to PDFium.}
+
+\item{readwrite}{Logical. As for \code{\link[=pdf_doc_open]{pdf_doc_open()}}.}
+}
+\value{
+A \code{pdfium_doc}.
+}
+\description{
+Convenience wrapper around \code{\link[=pdf_doc_open]{pdf_doc_open()}} that fetches the
+bytes of a remote (or \verb{file://}) URL via base R's \code{\link[=url]{url()}} +
+\code{\link[=readBin]{readBin()}} and loads the result through PDFium's in-memory
+path (\code{FPDF_LoadMemDocument64}). No temporary file is left on
+disk; the bytes live in R memory for the document's lifetime.
+}
+\details{
+Network errors propagate from \code{\link[=url]{url()}} / \code{\link[=readBin]{readBin()}} (typical
+shape: \verb{cannot open URL '...'} from \verb{connection failed}). The
+returned \code{pdfium_doc}'s \verb{$path} field is the URL string itself,
+so \link[=print.pdfium_doc]{print()} and \code{\link[=pdf_doc_summary]{pdf_doc_summary()}} surface
+the source even though no local path exists.
+}
+\examples{
+fixture <- system.file("extdata", "fixtures", "minimal.pdf",
+  package = "pdfium"
+)
+if (nzchar(fixture)) {
+  doc <- pdf_doc_open_url(paste0("file://", fixture))
+  pdf_page_count(doc)
+  pdf_doc_close(doc)
+}
+}
+\seealso{
+\code{\link[=pdf_doc_open]{pdf_doc_open()}} for the doc-open primitive.
+}
diff --git a/tests/testthat/test-doc-open-url.R b/tests/testthat/test-doc-open-url.R
new file mode 100644
index 0000000..989fd2f
--- /dev/null
+++ b/tests/testthat/test-doc-open-url.R
@@ -0,0 +1,73 @@
+# Tests for pdf_doc_open_url(). None of these tests need network
+# access, so nothing has to be skipped on CRAN: the happy paths use
+# the `file://` scheme against a shipped fixture, which exercises
+# the same url() + readBin() code path as a real `https://` URL,
+# and the http(s) tests only target an unreachable host.
+
+test_that("pdf_doc_open_url opens a file:// URL", {
+  url <- paste0("file://", fixture_path("minimal"))
+  doc <- pdf_doc_open_url(url)
+  on.exit(pdf_doc_close(doc), add = TRUE)
+  expect_s3_class(doc, "pdfium_doc")
+  expect_identical(pdf_page_count(doc), 1L)
+})
+
+test_that("pdf_doc_open_url stores the URL as the doc path", {
+  url <- paste0("file://", fixture_path("minimal"))
+  doc <- pdf_doc_open_url(url)
+  on.exit(pdf_doc_close(doc), add = TRUE)
+  expect_identical(doc$path, url)
+})
+
+test_that("pdf_doc_open_url forwards password + readwrite flags", {
+  url <- paste0("file://", fixture_path("minimal"))
+  doc <- pdf_doc_open_url(url, password = NULL, readwrite = TRUE)
+  on.exit(pdf_doc_close(doc), add = TRUE)
+  expect_true(doc$readwrite)
+})
+
+test_that("pdf_doc_open_url rejects non-URL strings", {
+  expect_error(pdf_doc_open_url("not-a-url"),
+               "must start with http://")
+  expect_error(pdf_doc_open_url("/path/to/file.pdf"),
+               "must start with http://")
+  expect_error(pdf_doc_open_url(""), "Assertion on")
+})
+
+test_that("pdf_doc_open_url rejects bad input types", {
+  expect_error(pdf_doc_open_url(42L), "Assertion on")
+  expect_error(pdf_doc_open_url(NULL), "Assertion on")
+  expect_error(pdf_doc_open_url(c("a", "b")), "Assertion on")
+})
+
+test_that("pdf_doc_open_url surfaces URL connection errors", {
+  bad_url <- "file:///definitely-not-a-file-on-this-system.pdf"
+  suppressWarnings(expect_error(pdf_doc_open_url(bad_url)))
+})
+
+test_that("pdf_doc_open_url accepts http(s) URLs structurally", {
+  # We can't actually fetch http(s) without network access, but the
+  # URL-shape validation should accept these prefixes and only fail
+  # later at the network step. base::url() emits a warning then
+  # errors on unreachable hosts; suppressWarnings so the test
+  # output isn't noisy.
+  # expect_error() returns the captured condition, so each URL is
+  # attempted once and its message is reused for the shape check.
+  fetch_error <- function(u) {
+    suppressWarnings(expect_error(pdf_doc_open_url(u)))
+  }
+  err_https <- fetch_error("https://example.invalid/x.pdf")
+  err_http <- fetch_error("http://example.invalid/x.pdf")
+  # Neither error should be the URL-shape error.
+  msgs <- c(conditionMessage(err_https), conditionMessage(err_http))
+  expect_false(any(grepl("must start with", msgs)))
+})
+
+test_that("pdf_doc_open_url round-trips through pdf_doc_summary", {
+  url <- paste0("file://", fixture_path("annotated"))
+  doc <- pdf_doc_open_url(url)
+  on.exit(pdf_doc_close(doc), add = TRUE)
+  s <- pdf_doc_summary(doc)
+  expect_identical(s$path, url)
+  expect_gt(s$form_field_count, 0L)  # annotated.pdf has form fields
+})

From 7c8fc1bb44802200ee37689b899d947f0bdcf104 Mon Sep 17 00:00:00 2001
From: Bill Denney <wdenney@humanpredictions.com>
Date: Thu, 21 May 2026 16:40:20 +0000
Subject: [PATCH 08/10] fix(ci): unbreak Rd cross-ref check + add a pre-commit
 hook to catch it
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

CI flagged R CMD check WARNING on every platform:

  pdf_doc_open_url.Rd: print.pdfium_doc
  Please provide package anchors for all Rd \link{} targets not in
  the package itself and the base packages.

The pdf_doc_open_url docstring had `[print()][print.pdfium_doc]` —
markdown for "render `print()` as link to topic `print.pdfium_doc`".
But `print.pdfium_doc` is an internal S3 method without its own Rd
page, so the link can't resolve.

Two changes:

1. Replace the bracketed cross-reference with plain `print()`
   inline code so the function name still renders as code but
   doesn't generate a broken link. Mirrors the same fix pattern
   used on PR #32's `[is_open()]` issue.

2. New pre-commit hook `rd-xref-check` (entry:
   tools/check-rd-xrefs.R) that runs the same internal R function
   `R CMD check` uses for its cross-reference step
   (tools:::.check_Rd_xrefs). Catches this class of WARNING on the
   developer machine before push.

   The script needs nothing more than the source tree (no install,
   no compile, no C++ build), so it's cheap enough to run on
   every commit — listed under `repo: local` next to the existing
   `pkgdown-reference-check` hook, with the same diagnostic-then-
   exit-1 pattern. Trigger files: any change to R/*.R, man/*.Rd,
   or DESCRIPTION.

   Manual verification: injecting the original broken link
   reproduces the CI failure shape:

     [check-rd-xrefs] pdf_doc_open_url.Rd: unresolved \link{}
       target 'print.pdfium_doc'

   Restoring the fix exits 0.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .pre-commit-config.yaml | 17 ++++++++++
 R/document.R            |  2 +-
 man/pdf_doc_open_url.Rd |  2 +-
 tools/check-rd-xrefs.R  | 73 +++++++++++++++++++++++++++++++++++++++++
 4 files changed, 92 insertions(+), 2 deletions(-)
 create mode 100755 tools/check-rd-xrefs.R

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 72f38bb..44ebf21 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -114,6 +114,23 @@ repos:
         pass_filenames: false
         files: '^(_pkgdown\.yml|NAMESPACE|R/.*\.R)$'
 
+      - id: rd-xref-check
+        name: Rd cross-reference resolution
+        description: >
+          Catches \link{} targets in man/*.Rd that don't resolve to
+          either a topic in this package, a documented dependency, or
+          one of the base / recommended packages. This is the same
+          WARNING that `R CMD check --as-cran` emits under "checking
+          Rd cross-references" and that fails every platform of the
+          cross-platform R-CMD-check matrix. Uses the same internal R
+          function (tools:::.check_Rd_xrefs) R CMD check itself uses;
+          operates on the source tree without needing the package
+          installed - fast enough for every commit.
+        entry: Rscript tools/check-rd-xrefs.R
+        language: system
+        pass_filenames: false
+        files: '^(R/.*\.R|man/.*\.Rd|DESCRIPTION)$'
+
   # Conventional Commits message check (server side: defense in depth via CI).
   - repo: https://github.com/compilerla/conventional-pre-commit
     rev: v3.6.0
diff --git a/R/document.R b/R/document.R
index f949511..7710229 100644
--- a/R/document.R
+++ b/R/document.R
@@ -81,7 +81,7 @@ pdf_doc_open <- function(path = NULL, source = NULL, password = NULL,
 #' Network errors propagate from [`url()`] / [`readBin()`] (typical
 #' shape: `cannot open URL '...'` from `connection failed`). The
 #' returned `pdfium_doc`'s `$path` field is the URL string itself,
-#' so [print()][print.pdfium_doc] and [pdf_doc_summary()] surface
+#' so `print()` and [pdf_doc_summary()] surface
 #' the source even though no local path exists.
 #'
 #' @param url Character scalar. Must start with one of `http://`,
diff --git a/man/pdf_doc_open_url.Rd b/man/pdf_doc_open_url.Rd
index de9259a..99e86c8 100644
--- a/man/pdf_doc_open_url.Rd
+++ b/man/pdf_doc_open_url.Rd
@@ -29,7 +29,7 @@ disk; the bytes live in R memory for the document's lifetime.
 Network errors propagate from \code{\link[=url]{url()}} / \code{\link[=readBin]{readBin()}} (typical
 shape: \verb{cannot open URL '...'} from \verb{connection failed}). The
 returned \code{pdfium_doc}'s \verb{$path} field is the URL string itself,
-so \link[=print.pdfium_doc]{print()} and \code{\link[=pdf_doc_summary]{pdf_doc_summary()}} surface
+so \code{print()} and \code{\link[=pdf_doc_summary]{pdf_doc_summary()}} surface
 the source even though no local path exists.
 }
 \examples{
diff --git a/tools/check-rd-xrefs.R b/tools/check-rd-xrefs.R
new file mode 100755
index 0000000..d5a1b67
--- /dev/null
+++ b/tools/check-rd-xrefs.R
@@ -0,0 +1,73 @@
+#!/usr/bin/env Rscript
+# tools/check-rd-xrefs.R
+#
+# Validates that every \link{} target in every Rd file under man/
+# resolves — either to a topic in this package, to a topic in a
+# documented dependency, or to one of the base / recommended
+# packages.
+#
+# Catches the same class of WARNING that `R CMD check --as-cran`
+# emits under "checking Rd cross-references":
+#
+#     Found the following Rd file(s) with Rd \link{} targets
+#     missing package anchors:
+#       pdf_X.Rd: some_unresolved_topic
+#
+# That WARNING fails the cross-platform R-CMD-check matrix on every
+# CI platform — better to catch it on the developer's machine.
+#
+# Uses `tools:::.check_Rd_xrefs()` (R-internal) which is the same
+# function `R CMD check` itself invokes for this check. It only
+# needs the source tree (no install / no compile), so it's
+# cheap enough to run on every commit.
+#
+# Entry point for the corresponding pre-commit hook in
+# .pre-commit-config.yaml. Exits 0 when every Rd cross-reference
+# resolves; 1 with a diagnostic when at least one does not.
+#
+# Skips (prints a note, exits 0) when:
+#   - `tools:::.check_Rd_xrefs` can't be found (it's an internal
+#     function, so a future R could rename or remove it)
+#   - there is no DESCRIPTION in the working directory (run from
+#     somewhere other than a package root)
+
+local({
+  if (!file.exists("DESCRIPTION")) {
+    message("[check-rd-xrefs] Not in a package root; skipping.")
+    return(invisible())
+  }
+  # The checker is an unexported `tools` function, so look it up
+  # by name and treat a failed lookup as "skip", not as a failure.
+  checker <- tryCatch(
+    get(".check_Rd_xrefs", envir = asNamespace("tools"),
+        inherits = FALSE),
+    error = function(cond) NULL
+  )
+  if (is.null(checker)) {
+    message("[check-rd-xrefs] tools:::.check_Rd_xrefs not available ",
+            "in this R; skipping. R-CMD-check on CI will still catch.")
+    return(invisible())
+  }
+
+  # `$bad` is read as a list keyed by Rd file name whose elements
+  # hold that file's unresolved targets (internal API; TODO confirm).
+  unresolved <- checker(dir = ".")$bad
+  if (length(unresolved) == 0L) {
+    return(invisible())
+  }
+
+  # One diagnostic per (Rd file, distinct unresolved target) pair.
+  for (rd_name in names(unresolved)) {
+    for (target in unique(unresolved[[rd_name]])) {
+      message(sprintf(
+        "[check-rd-xrefs] %s: unresolved \\link{} target '%s'",
+        rd_name, target
+      ))
+    }
+  }
+  message("[check-rd-xrefs] Fix by either qualifying the link with ",
+          "a package name (e.g. [graphics::plot()]), pointing it at ",
+          "an actual Rd topic in this package, or replacing the link ",
+          "with plain `code` formatting if the target has no Rd page.")
+  quit(status = 1L, save = "no")
+})

From 6f16c08e59bf2f7a551ebbbab2a9cc653237017c Mon Sep 17 00:00:00 2001
From: Bill Denney <wdenney@humanpredictions.com>
Date: Thu, 21 May 2026 17:52:18 +0000
Subject: [PATCH 09/10] fix(ci): list summary.* methods in pkgdown index +
 harden the hook
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

pkgdown's `build_reference_index()` errored on PR #36:

  In _pkgdown.yml, 2 topics missing from index:
    "summary.pdfium_doc" and "summary.pdfium_page"

Both S3 methods have their own @export and so their own Rd page
(summary.pdfium_doc.Rd, summary.pdfium_page.Rd). pkgdown enforces
that every non-internal Rd topic appears in the reference index.

Two changes:

1. Add both to _pkgdown.yml next to their `pdf_*_summary()`
   companions.

2. Rewrite tools/check-pkgdown-reference.R to enumerate man/*.Rd
   files directly rather than reconstruct the topic set from
   NAMESPACE's `export()` + `S3method()` entries. The old design
   only flagged missing topics for NAMESPACE `export()` entries —
   `S3method()`-only entries (the path that produces summary.*.Rd)
   slipped through. The new design:
     * filters out Rd files marked `\keyword{internal}` (matches
       the only existing internal Rd, pdfium-package.Rd)
     * computes the topic set as { Rd basename } ∪ { every \alias{}
       entry inside }, so @rdname-collapsed methods (e.g. each
       pdfium_*_code paired with its _name in one Rd) still count
       as valid YAML entries
     * flags topics missing from YAML and YAML entries missing from
       man/, same diagnostic shape as before

   Manual verification: removing the summary.* entries from
   _pkgdown.yml reproduces the CI failure shape:

     [check-pkgdown-reference] Documented but not in _pkgdown.yml
       reference index: summary.pdfium_doc, summary.pdfium_page

   Restoring exits 0.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 _pkgdown.yml                    |  2 ++
 tools/check-pkgdown-reference.R | 51 +++++++++++++++++++++------------
 2 files changed, 34 insertions(+), 19 deletions(-)

diff --git a/_pkgdown.yml b/_pkgdown.yml
index 69571e8..b5a94cf 100644
--- a/_pkgdown.yml
+++ b/_pkgdown.yml
@@ -32,6 +32,7 @@ reference:
       - pdf_doc_info
       - pdf_doc_meta
       - pdf_doc_summary
+      - summary.pdfium_doc
       - pdf_parse_date
       - pdf_doc_text
       - pdf_doc_fonts
@@ -95,6 +96,7 @@ reference:
       - pdf_page_rotation
       - pdf_page_box
       - pdf_pages_summary
+      - summary.pdfium_page
       - pdf_page_links
       - pdf_link_at_point
       - pdf_link_annot_at_point
diff --git a/tools/check-pkgdown-reference.R b/tools/check-pkgdown-reference.R
index 3c874bd..9290fcf 100644
--- a/tools/check-pkgdown-reference.R
+++ b/tools/check-pkgdown-reference.R
@@ -40,35 +40,48 @@ local({
                                use.names = FALSE))
   yaml_topics <- yaml_topics[!is.na(yaml_topics) & nzchar(yaml_topics)]
 
-  ns_lines <- readLines("NAMESPACE")
-  # `exportPattern` and `exportClasses` not handled - this hook is
-  # for the typical case where pdfium's exports are all `export(name)`.
-  exports <- sub("^export\\(([^)]+)\\)$", "\\1",
-                 grep("^export\\(", ns_lines, value = TRUE))
-  # S3 methods registered via `@exportS3Method` appear in NAMESPACE
-  # as `S3method(generic, class)` or `S3method(pkg::generic, class)`
-  # when the generic lives in another package (e.g.
-  # `S3method(graphics::plot, pdfium_bitmap)`). pkgdown writes the
-  # method's Rd topic as the bare `generic.class` in either case,
-  # so strip any `pkg::` prefix on the generic before forming the
-  # topic name.
-  s3 <- sub("^S3method\\(([^,]+),\\s*([^)]+)\\)$", "\\1.\\2",
-            grep("^S3method\\(", ns_lines, value = TRUE))
-  s3 <- sub("^[^.]+::", "", s3)
-  topics <- unique(c(exports, s3))
+  # The canonical "what topics should pkgdown index" set is every
+  # man/*.Rd file that isn't marked `\keyword{internal}`. Walking
+  # the Rd files directly handles every topic-creation path
+  # (`@export`, S3 methods registered via `@exportS3Method`, manual
+  # `@aliases`/`@rdname`-collapsed methods) without having to
+  # re-parse NAMESPACE's S3method dispatch records.
+  rd_files <- list.files("man", pattern = "\\.Rd$", full.names = FALSE)
+  rd_files <- rd_files[nzchar(rd_files)]
+  if (length(rd_files) == 0L) {
+    return(invisible())
+  }
+
+  # A topic in pkgdown's reference can be either the Rd file's
+  # basename OR any \alias{} entry inside it (the @rdname-collapsed
+  # case: when several R functions share one Rd file, every
+  # function's name becomes an alias for the shared topic). Both
+  # forms resolve, so both count as valid YAML entries.
+  rd_topics_and_aliases <- function(rd_file) {
+    rd_text <- readLines(file.path("man", rd_file), warn = FALSE)
+    is_internal <- any(grepl("\\\\keyword\\{internal\\}", rd_text))
+    if (is_internal) {
+      return(character(0))
+    }
+    alias_lines <- grep("\\\\alias\\{", rd_text, value = TRUE)
+    alias_names <- sub(".*\\\\alias\\{([^}]+)\\}.*", "\\1", alias_lines)
+    unique(c(sub("\\.Rd$", "", rd_file), alias_names))
+  }
+  topics <- unique(unlist(lapply(rd_files, rd_topics_and_aliases),
+                          use.names = FALSE))
 
-  missing_in_yaml  <- setdiff(exports, yaml_topics)
+  missing_in_yaml  <- setdiff(topics, yaml_topics)
   unknown_in_yaml  <- setdiff(yaml_topics, topics)
 
   problems <- character()
   if (length(missing_in_yaml) > 0L) {
     problems <- c(problems, sprintf(
-      "Exported but not in _pkgdown.yml reference index: %s",
+      "Documented but not in _pkgdown.yml reference index: %s",
       paste(missing_in_yaml, collapse = ", ")))
   }
   if (length(unknown_in_yaml) > 0L) {
     problems <- c(problems, sprintf(
-      "In _pkgdown.yml reference index but not an export: %s",
+      "In _pkgdown.yml reference index but no matching man/*.Rd: %s",
       paste(unknown_in_yaml, collapse = ", ")))
   }
   if (length(problems) > 0L) {

From ffdf5aa04885a3e259b8d96564d6fae5d0f085e4 Mon Sep 17 00:00:00 2001
From: Bill Denney <wdenney@humanpredictions.com>
Date: Thu, 21 May 2026 17:56:07 +0000
Subject: [PATCH 10/10] =?UTF-8?q?feat(doc):=20add=20pdf=5Fdir=5Fsummary()?=
 =?UTF-8?q?=20=E2=80=94=20bulk-triage=20every=20PDF=20in=20a=20folder?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Scans a directory for PDF files and returns a tibble with one row
per file in the pdf_doc_summary() column shape. The natural
replacement for the standard "loop over a folder of PDFs and find
the ones with forms / attachments / encryption" triage workflow.

* Recursive descent via `recursive = TRUE`.
* Case-insensitive `pattern = "\\.pdf$"` by default — picks up
  both `.pdf` and `.PDF`.
* Optional shared `password` applied to every file.
* `errors` argument selects how broken / non-PDF files are handled:
  * "warn" (default) — surface a warning per failure and skip
  * "skip" — silently skip
  * "stop" — abort on the first failure

Internal pdf_doc_summary_empty() helper hoisted to module scope so
its zero-row template can be tested without exercising the
no-files-in-directory branch through the full file scan.

14 new tests in test-dir-summary.R cover row-per-PDF count, column
shape parity with pdf_doc_summary, path preservation, recursive
descent, the empty-directory case, the empty-tibble helper itself,
case-insensitive .PDF matching, all three errors modes, the
zero-rows-when-everything-fails case, password forwarding,
input-shape rejection, and custom patterns.

Full suite 2166/2166 pass; R coverage 100% (2907/2907 lines);
0 lints; pkgdown reference check passes (the hardened hook from
the previous commit caught my omission of pdf_dir_summary from
_pkgdown.yml during this commit's development — verifying the
hook does what it should).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 NAMESPACE                         |   1 +
 NEWS.md                           |   7 ++
 R/doc.R                           | 110 ++++++++++++++++++++++++++
 _pkgdown.yml                      |   1 +
 man/pdf_dir_summary.Rd            |  62 +++++++++++++++
 tests/testthat/test-dir-summary.R | 125 ++++++++++++++++++++++++++++++
 6 files changed, 306 insertions(+)
 create mode 100644 man/pdf_dir_summary.Rd
 create mode 100644 tests/testthat/test-dir-summary.R

diff --git a/NAMESPACE b/NAMESPACE
index 6b7f67a..9b1b700 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -105,6 +105,7 @@ export(pdf_bookmark_title)
 export(pdf_bookmark_uri)
 export(pdf_clip_path_count)
 export(pdf_clip_path_segments)
+export(pdf_dir_summary)
 export(pdf_doc_bookmark_find)
 export(pdf_doc_bookmarks)
 export(pdf_doc_close)
diff --git a/NEWS.md b/NEWS.md
index 380325f..9b1414a 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -40,6 +40,13 @@ PDFs created with `pdf_doc_new()` are also writable).
   dispatch to the matching tibble — `summary(page)` adds the
   page-loaded counts (annotation count, page-object count,
   text-run count, link count) since the page is already loaded.
+* `pdf_dir_summary(dir)` — scans a directory for PDF files and
+  returns one row per file in the `pdf_doc_summary()` shape.
+  Recursive scan via `recursive = TRUE`; pattern-matches `.pdf`
+  case-insensitively by default. The `errors` argument selects
+  one of `"warn"` (default — surface broken files but don't
+  abort), `"skip"` (silently drop), or `"stop"` (abort on the
+  first failure).
 
 ## Page objects, paths, and text
 
diff --git a/R/doc.R b/R/doc.R
index d0ccc75..6114187 100644
--- a/R/doc.R
+++ b/R/doc.R
@@ -642,6 +642,116 @@ summary.pdfium_doc <- function(object, ...) {
   pdf_doc_summary(object)
 }
 
+#' Summarise every PDF in a directory in one call
+#'
+#' Scans a directory for PDF files and returns a tibble whose rows
+#' are the [pdf_doc_summary()] output for each file. The natural
+#' replacement for the standard "loop over a folder of PDFs and
+#' triage" workflow — encrypted-which / has-forms-which /
+#' has-attachments-which.
+#'
+#' Files that fail to open (corrupt, wrong format, password
+#' protected) are handled per the `errors` argument:
+#'
+#' * `"warn"` (default) — a `warning()` per failed file; the file
+#'   is dropped from the result tibble.
+#' * `"skip"` — silently dropped.
+#' * `"stop"` — the first failed file raises an error and the
+#'   function aborts.
+#'
+#' @param dir Character scalar. Path to the directory to scan.
+#' @param pattern Regular expression filtering filenames. Defaults
+#'   to `"\\.pdf$"` (case-insensitive).
+#' @param recursive Logical. When `TRUE`, descend into
+#'   subdirectories. Defaults `FALSE`.
+#' @param password Optional password applied to every file. `NULL`
+#'   (default) tries each file without a password. Useful when all
+#'   files share the same password.
+#' @param errors One of `"warn"`, `"skip"`, `"stop"` — see Details.
+#' @return A tibble with the same columns as [pdf_doc_summary()].
+#'   Zero rows when the directory has no PDFs (or every PDF failed
+#'   to open under `errors = "skip"` / `"warn"`).
+#' @seealso [pdf_doc_summary()] for the single-file companion.
+#' @examples
+#' fixture_dir <- system.file("extdata", "fixtures",
+#'                            package = "pdfium")
+#' if (nzchar(fixture_dir)) {
+#'   pdf_dir_summary(fixture_dir)
+#' }
+#' @export
+pdf_dir_summary <- function(dir = ".", pattern = "\\.pdf$",
+                             recursive = FALSE, password = NULL,
+                             errors = c("warn", "skip", "stop")) {
+  checkmate::assert_directory_exists(dir)
+  checkmate::assert_string(pattern)
+  checkmate::assert_flag(recursive)
+  # Check `password` up front: any error a malformed value causes would
+  # otherwise be caught per file below and reported as an unreadable PDF.
+  checkmate::assert_string(password, null.ok = TRUE)
+  errors <- match.arg(errors)
+
+  files <- list.files(dir, pattern = pattern, recursive = recursive,
+                       full.names = TRUE, ignore.case = TRUE)
+  if (length(files) == 0L) {
+    return(pdf_doc_summary_empty())
+  }
+
+  rows <- lapply(files, function(f) {
+    tryCatch(
+      pdf_doc_summary(f, password = password),
+      error = function(e) {
+        msg <- sprintf("pdf_dir_summary: failed to read '%s': %s",
+                       f, conditionMessage(e))
+        # "skip" matches no arm: the failure is dropped silently.
+        switch(errors,
+               stop = stop(msg, call. = FALSE),
+               warn = warning(msg, call. = FALSE))
+        NULL
+      }
+    )
+  })
+  ok <- !vapply(rows, is.null, logical(1L))
+  if (!any(ok)) {
+    return(pdf_doc_summary_empty())
+  }
+  tibble::as_tibble(do.call(rbind, rows[ok]))
+}
+
+# Internal: zero-row tibble mirroring pdf_doc_summary()'s columns, so
+# pdf_dir_summary() returns one shape even when no file matched or
+# every file failed under `errors = "skip"` / `"warn"`.
+pdf_doc_summary_empty <- function() {
+  tibble::tibble(
+    path                 = character(),
+    page_count           = integer(),
+    file_version         = integer(),
+    title                = character(),
+    author               = character(),
+    subject              = character(),
+    keywords             = character(),
+    creator              = character(),
+    producer             = character(),
+    creation_date        = character(),  # raw PDF date string
+    mod_date             = character(),  # raw PDF date string
+    trapped              = character(),
+    creation_date_parsed = as.POSIXct(character(), tz = "UTC"),
+    mod_date_parsed      = as.POSIXct(character(), tz = "UTC"),
+    is_tagged            = logical(),
+    is_encrypted         = logical(),
+    security_revision    = integer(),
+    xref_valid           = logical(),
+    bookmark_count       = integer(),
+    attachment_count     = integer(),
+    signature_count      = integer(),
+    form_field_count     = integer(),
+    javascript_count     = integer(),
+    named_dest_count     = integer(),
+    has_page_labels      = logical(),
+    file_id_permanent    = character(),  # hex string
+    file_id_changing     = character()   # hex string
+  )
+}
+
 # Internal: convert pdf_doc_file_id()'s raw return to a hex string,
 # or NA_character_ when empty. Hoisted from pdf_doc_summary so its
 # two branches can be unit-tested without a fixture that carries an
diff --git a/_pkgdown.yml b/_pkgdown.yml
index b5a94cf..57e4679 100644
--- a/_pkgdown.yml
+++ b/_pkgdown.yml
@@ -32,6 +32,7 @@ reference:
       - pdf_doc_info
       - pdf_doc_meta
       - pdf_doc_summary
+      - pdf_dir_summary
       - summary.pdfium_doc
       - pdf_parse_date
       - pdf_doc_text
diff --git a/man/pdf_dir_summary.Rd b/man/pdf_dir_summary.Rd
new file mode 100644
index 0000000..aeafc80
--- /dev/null
+++ b/man/pdf_dir_summary.Rd
@@ -0,0 +1,62 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/doc.R
+\name{pdf_dir_summary}
+\alias{pdf_dir_summary}
+\title{Summarise every PDF in a directory in one call}
+\usage{
+pdf_dir_summary(
+  dir = ".",
+  pattern = "\\\\.pdf$",
+  recursive = FALSE,
+  password = NULL,
+  errors = c("warn", "skip", "stop")
+)
+}
+\arguments{
+\item{dir}{Character scalar. Path to the directory to scan.}
+
+\item{pattern}{Regular expression filtering filenames. Defaults
+to \code{"\\\\.pdf$"} (case-insensitive).}
+
+\item{recursive}{Logical. When \code{TRUE}, descend into
+subdirectories. Defaults \code{FALSE}.}
+
+\item{password}{Optional password applied to every file. \code{NULL}
+(default) tries each file without a password. Useful when all
+files share the same password.}
+
+\item{errors}{One of \code{"warn"}, \code{"skip"}, \code{"stop"} — see Details.}
+}
+\value{
+A tibble with the same columns as \code{\link[=pdf_doc_summary]{pdf_doc_summary()}}.
+Zero rows when the directory has no PDFs (or every PDF failed
+to open under \code{errors = "skip"} / \code{"warn"}).
+}
+\description{
+Scans a directory for PDF files and returns a tibble whose rows
+are the \code{\link[=pdf_doc_summary]{pdf_doc_summary()}} output for each file. The natural
+replacement for the standard "loop over a folder of PDFs and
+triage" workflow — encrypted-which / has-forms-which /
+has-attachments-which.
+}
+\details{
+Files that fail to open (corrupt, wrong format, password
+protected) are handled per the \code{errors} argument:
+\itemize{
+\item \code{"warn"} (default) — a \code{warning()} per failed file; the file
+is dropped from the result tibble.
+\item \code{"skip"} — silently dropped.
+\item \code{"stop"} — the first failed file raises an error and the
+function aborts.
+}
+}
+\examples{
+fixture_dir <- system.file("extdata", "fixtures",
+                           package = "pdfium")
+if (nzchar(fixture_dir)) {
+  pdf_dir_summary(fixture_dir)
+}
+}
+\seealso{
+\code{\link[=pdf_doc_summary]{pdf_doc_summary()}} for the single-file companion.
+}
diff --git a/tests/testthat/test-dir-summary.R b/tests/testthat/test-dir-summary.R
new file mode 100644
index 0000000..437584b
--- /dev/null
+++ b/tests/testthat/test-dir-summary.R
@@ -0,0 +1,125 @@
+# Tests for pdf_dir_summary() — the bulk-triage helper that wraps
+# pdf_doc_summary() over every PDF in a directory.
+
+# Locates the fixture PDFs shipped under the package's extdata/fixtures.
+fixture_dir <- function() {
+  system.file(file.path("extdata", "fixtures"), package = "pdfium")
+}
+
+test_that("pdf_dir_summary returns a tibble with one row per PDF", {
+  s <- pdf_dir_summary(fixture_dir())
+  expect_s3_class(s, "tbl_df")
+  files <- list.files(fixture_dir(), pattern = "\\.pdf$", ignore.case = TRUE)
+  expect_equal(nrow(s), length(files))
+})
+
+test_that("pdf_dir_summary column shape matches pdf_doc_summary", {
+  single_doc <- pdf_doc_summary(fixture_path("shapes"))
+  whole_dir  <- pdf_dir_summary(fixture_dir())
+  expect_named(whole_dir, names(single_doc))
+})
+
+test_that("pdf_dir_summary preserves the path column", {
+  s <- pdf_dir_summary(fixture_dir())
+  expect_true(all(grepl("\\.pdf$", s$path, ignore.case = TRUE)))
+  expect_true(all(file.exists(s$path)))
+})
+
+test_that("pdf_dir_summary recursive descent works", {
+  # Layout: <root>/top.pdf plus <root>/subdir/nested.pdf.
+  root <- withr::local_tempdir()
+  nested_dir <- file.path(root, "subdir")
+  dir.create(nested_dir)
+  file.copy(fixture_path("minimal"), file.path(root, "top.pdf"))
+  file.copy(fixture_path("minimal"), file.path(nested_dir, "nested.pdf"))
+
+  top_only <- pdf_dir_summary(root, recursive = FALSE)
+  expect_equal(nrow(top_only), 1L)
+
+  with_nested <- pdf_dir_summary(root, recursive = TRUE)
+  expect_equal(nrow(with_nested), 2L)
+})
+
+test_that("pdf_dir_summary returns zero rows for an empty dir", {
+  empty_dir <- withr::local_tempdir()
+  result <- pdf_dir_summary(empty_dir)
+  expect_equal(nrow(result), 0L)
+  expect_s3_class(result, "tbl_df")
+})
+
+test_that("pdf_dir_summary's empty tibble has the right shape", {
+  reference <- pdf_doc_summary(fixture_path("shapes"))
+  template <- pdfium:::pdf_doc_summary_empty()
+  expect_s3_class(template, "tbl_df")
+  expect_equal(nrow(template), 0L)
+  expect_named(template, names(reference))
+})
+
+test_that("pdf_dir_summary case-insensitive PDF pattern matches .PDF too", {
+  tmp <- withr::local_tempdir()
+  for (file_name in c("upper.PDF", "lower.pdf")) {
+    file.copy(fixture_path("minimal"), file.path(tmp, file_name))
+  }
+  expect_equal(nrow(pdf_dir_summary(tmp)), 2L)
+})
+
+test_that("pdf_dir_summary errors = stop aborts on a bad file", {
+  mixed_dir <- withr::local_tempdir()
+  # One readable PDF next to a text file that only looks like a PDF.
+  writeLines("not a pdf", file.path(mixed_dir, "bad.pdf"))
+  file.copy(fixture_path("minimal"), file.path(mixed_dir, "good.pdf"))
+  expect_error(
+    pdf_dir_summary(mixed_dir, errors = "stop"), regexp = "failed to read"
+  )
+})
+
+test_that("pdf_dir_summary errors = warn drops bad files with a warning", {
+  tmp <- withr::local_tempdir()
+  file.copy(fixture_path("minimal"), file.path(tmp, "good.pdf"))
+  writeLines("not a pdf", file.path(tmp, "bad.pdf"))
+  # Assert the warning and the surviving row count from one scan.
+  expect_warning(
+    s <- pdf_dir_summary(tmp, errors = "warn"),
+    "failed to read"
+  )
+  expect_equal(nrow(s), 1L)
+})
+
+test_that("pdf_dir_summary errors = skip silently drops bad files", {
+  mixed_dir <- withr::local_tempdir()
+  writeLines("not a pdf", file.path(mixed_dir, "bad.pdf"))
+  file.copy(fixture_path("minimal"), file.path(mixed_dir, "good.pdf"))
+  expect_no_warning(kept <- pdf_dir_summary(mixed_dir, errors = "skip"))
+  expect_equal(nrow(kept), 1L)
+})
+
+test_that("pdf_dir_summary returns zero rows when every file fails", {
+  junk_dir <- withr::local_tempdir()
+  writeLines("not a pdf", file.path(junk_dir, "bad1.pdf"))
+  writeLines("also not a pdf", file.path(junk_dir, "bad2.pdf"))
+  none_read <- suppressWarnings(pdf_dir_summary(junk_dir, errors = "skip"))
+  expect_equal(nrow(none_read), 0L)
+})
+
+test_that("pdf_dir_summary forwards the password argument", {
+  # NOTE(review): NULL is also the default, so this is a smoke test.
+  expect_gt(nrow(pdf_dir_summary(fixture_dir(), password = NULL)), 0L)
+})
+
+test_that("pdf_dir_summary rejects bad inputs", {
+  fixtures <- fixture_dir()
+  expect_error(pdf_dir_summary("/this/path/does/not/exist"), "Assertion on")
+  expect_error(pdf_dir_summary(fixtures, pattern = NA_character_),
+               "Assertion on")
+  expect_error(pdf_dir_summary(fixtures, recursive = "yes"), "Assertion on")
+  # `errors` goes through match.arg(), which has its own message.
+  expect_error(pdf_dir_summary(fixtures, errors = "bogus"),
+               "'arg' should be one of")
+})
+
+test_that("pdf_dir_summary respects a custom pattern", {
+  # Anchored pattern: only annotated.pdf should survive the filter.
+  hits <- pdf_dir_summary(fixture_dir(), pattern = "^annotated\\.pdf$")
+  expect_equal(nrow(hits), 1L)
+  expect_match(hits$path[[1L]], "annotated\\.pdf$")
+})