From 05d990bb9e7418be64a3b5c322c8926eee70a530 Mon Sep 17 00:00:00 2001 From: Bill Denney Date: Thu, 21 May 2026 14:20:38 +0000 Subject: [PATCH 01/10] =?UTF-8?q?feat(doc):=20add=20pdf=5Fdoc=5Fsummary()?= =?UTF-8?q?=20=E2=80=94=20one-call=20PDF=20triage=20helper?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Returns a single-row tibble that aggregates the most-asked-for facts about a PDF document: file path, page count, Info-dictionary metadata, structural feature flags (forms, attachments, bookmarks, signatures, JavaScript, tagged-PDF), counts for each feature group, encryption state, xref validity, and the file-ID tuple. Designed to replace the eight-or-so individual reader calls users typically chain together when triaging an unfamiliar PDF. 27 columns aggregated from existing readers: * `pdf_doc_info()` — page count, file version, Info-dict text + dates (both raw PDF strings and POSIXct parses) * `pdf_doc_is_tagged()`, `pdf_doc_security()`, `pdf_doc_xref_valid()` — structural / encryption flags * `pdf_doc_bookmarks()`, `pdf_attachments()`, `pdf_signatures()`, `pdf_form_fields()`, `pdf_doc_javascript()`, `pdf_doc_named_dests()` — `length()` over each list * `pdf_page_labels()` — boolean "has labelled pages?" * `pdf_doc_file_id()` — hex-encoded as character (NA when absent) Accepts both a `pdfium_doc` and a character path, mirroring the two-input-form convention `pdf_doc_info()` already uses. The path form opens + closes internally. The `file_id` columns required a small helper (`file_id_hex_or_na`, internal) because `pdf_doc_file_id()` returns a `raw(0)` for the common case of PDFs without an `/ID` trailer entry — letting that go into a tibble column recycles the whole tibble to zero rows. The helper is hoisted to module scope so both branches can be unit- tested without a fixture that has `/ID` set (none of the shipped fixtures do). 
14 new tests in `test-doc-summary.R` cover column shape, types, counts, path / raw-bytes / doc input forms, error cases, and both branches of the file-ID helper. Full suite 2081/2081 pass; R coverage 100% (2782/2782 lines); 0 lints. DESCRIPTION version bumped to 0.1.0.9000 to mark the start of the post-CRAN-submission development cycle. Co-Authored-By: Claude Opus 4.7 (1M context) --- DESCRIPTION | 2 +- NAMESPACE | 1 + NEWS.md | 12 +++ R/doc.R | 129 +++++++++++++++++++++++++++++ _pkgdown.yml | 1 + man/pdf_doc_summary.Rd | 75 +++++++++++++++++ tests/testthat/test-doc-summary.R | 131 ++++++++++++++++++++++++++++++ 7 files changed, 350 insertions(+), 1 deletion(-) create mode 100644 man/pdf_doc_summary.Rd create mode 100644 tests/testthat/test-doc-summary.R diff --git a/DESCRIPTION b/DESCRIPTION index 367f07a..24eef99 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: pdfium Title: Idiomatic R Bindings to the PDFium PDF Engine -Version: 0.1.0 +Version: 0.1.0.9000 Authors@R: c( person("Bill", "Denney", , "wdenney@humanpredictions.com", role = c("aut", "cre"), diff --git a/NAMESPACE b/NAMESPACE index 89c67fc..3f15c60 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -121,6 +121,7 @@ export(pdf_doc_page_mode) export(pdf_doc_permissions) export(pdf_doc_security) export(pdf_doc_set_language) +export(pdf_doc_summary) export(pdf_doc_text) export(pdf_doc_trailer_ends) export(pdf_doc_user_permissions) diff --git a/NEWS.md b/NEWS.md index 5ab8f0a..580f04e 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,15 @@ +# pdfium (development version) + +## New features + +* `pdf_doc_summary()` — one-call helper that returns a single-row + tibble aggregating the most-asked-for facts about a PDF: path, + page count, Info-dictionary metadata, structural feature flags + (tagged, encrypted, has-forms, has-attachments, …), counts for + each feature group, and the file-ID tuple. Replaces the + eight-or-so individual calls users typically chain together when + triaging a PDF. 
+ # pdfium 0.1.0 Initial CRAN release. This is the first public version of `pdfium`, diff --git a/R/doc.R b/R/doc.R index bed9d01..c31f939 100644 --- a/R/doc.R +++ b/R/doc.R @@ -506,3 +506,132 @@ pdf_doc_trailer_ends <- function(doc) { doc <- as_open_doc(doc) cpp_doc_trailer_ends(doc$ptr) } + +#' One-call summary of a PDF document +#' +#' Returns a single-row tibble that aggregates the most-asked-for +#' facts about a PDF document: file path, page count, Info-dictionary +#' metadata, structural feature flags (forms, attachments, bookmarks, +#' signatures, JavaScript, tagged-PDF), counts for each of those +#' feature groups, encryption state, and the file-ID tuple. Designed +#' to replace the eight-or-so individual calls users typically chain +#' together when triaging a PDF. +#' +#' Each column either exposes an existing reader or is a `length()` +#' over the matching `pdfium_*_list`. No new C-side work — purely an +#' R-side aggregation. See **Columns** below for the source reader +#' for each entry. +#' +#' @section Columns: +#' * `path` — character; canonical path the doc was opened from, or +#' `""` for in-memory loads. +#' * `page_count`, `file_version` — from [pdf_doc_info()]. +#' * `title`, `author`, `subject`, `keywords`, `creator`, `producer`, +#' `creation_date`, `mod_date`, `trapped` — from [pdf_doc_info()]; +#' missing entries appear as `""`. +#' * `creation_date_parsed`, `mod_date_parsed` — POSIXct (UTC), `NA` +#' when the source date is empty or unparseable. From +#' [pdf_parse_date()]. +#' * `is_tagged` — from [pdf_doc_is_tagged()]. +#' * `is_encrypted` — `TRUE` when [pdf_doc_security()] returns a +#' non-NA revision; `FALSE` otherwise. +#' * `security_revision` — from [pdf_doc_security()]; `NA` for +#' unencrypted PDFs. +#' * `xref_valid` — from [pdf_doc_xref_valid()]. 
+#' * `bookmark_count`, `attachment_count`, `signature_count`, +#' `form_field_count`, `javascript_count`, `named_dest_count` — +#' `length()` of [pdf_doc_bookmarks()], [pdf_attachments()], +#' [pdf_signatures()], [pdf_form_fields()], [pdf_doc_javascript()], +#' and [pdf_doc_named_dests()] respectively. Zero when the +#' document has none of the corresponding entries. +#' * `has_page_labels` — `TRUE` when [pdf_page_labels()] returns +#' non-NA strings. +#' * `file_id_permanent`, `file_id_changing` — from +#' [pdf_doc_file_id()]; UTF-8 hex strings or `NA`. +#' +#' @param doc A `pdfium_doc` from [pdf_doc_open()], or a character +#' path. +#' @param password Optional password for encrypted PDFs when `doc` +#' is a path. Ignored when `doc` is an open `pdfium_doc`. +#' @return A one-row tibble. +#' @seealso [pdf_doc_info()] for the Info-dictionary subset alone, +#' the per-feature readers listed under **Columns** for richer +#' per-row data. +#' @examples +#' fixture <- system.file("extdata", "fixtures", "annotated.pdf", +#' package = "pdfium" +#' ) +#' if (nzchar(fixture)) pdf_doc_summary(fixture) +#' @export +pdf_doc_summary <- function(doc, password = NULL) { + if (is.character(doc)) { + handle <- pdf_doc_open(doc, password = password) + on.exit(pdf_doc_close(handle), add = TRUE) + return(pdf_doc_summary(handle)) + } + checkmate::assert_class(doc, "pdfium_doc") + if (!is_open(doc)) stop("Document has been closed.", call. 
= FALSE) + + info <- pdf_doc_info(doc) + rev <- pdf_doc_security(doc) + page_labels <- tryCatch(pdf_page_labels(doc), + error = function(e) NULL) + file_id <- list( + permanent = file_id_hex_or_na(tryCatch( + pdf_doc_file_id(doc, "permanent"), + error = function(e) raw(0) + )), + changing = file_id_hex_or_na(tryCatch( + pdf_doc_file_id(doc, "changing"), + error = function(e) raw(0) + )) + ) + + tibble::tibble( + path = doc$path, + page_count = info$page_count, + file_version = info$file_version, + title = info$title %||% "", + author = info$author %||% "", + subject = info$subject %||% "", + keywords = info$keywords %||% "", + creator = info$creator %||% "", + producer = info$producer %||% "", + creation_date = info$creation_date %||% "", + mod_date = info$mod_date %||% "", + trapped = info$trapped %||% "", + creation_date_parsed = info$creation_date_parsed, + mod_date_parsed = info$mod_date_parsed, + is_tagged = pdf_doc_is_tagged(doc), + is_encrypted = !is.na(rev), + security_revision = rev, + xref_valid = pdf_doc_xref_valid(doc), + bookmark_count = length(pdf_doc_bookmarks(doc)), + attachment_count = length(pdf_attachments(doc)), + signature_count = length(pdf_signatures(doc)), + form_field_count = length(pdf_form_fields(doc)), + javascript_count = length(pdf_doc_javascript(doc)), + named_dest_count = length(pdf_doc_named_dests(doc)), + has_page_labels = !is.null(page_labels) && + any(!is.na(page_labels) & nzchar(page_labels)), + file_id_permanent = file_id$permanent, + file_id_changing = file_id$changing + ) +} + +# Internal: NULL-coalescing operator so we don't pull rlang in just +# for the summary path. Unlike rlang's %||%, a length-1 NA also yields `b`. +`%||%` <- function(a, b) { + if (is.null(a) || (length(a) == 1L && is.na(a))) b else a +} + +# Internal: convert pdf_doc_file_id()'s raw return to a hex string, +# or NA_character_ when empty. 
Hoisted from pdf_doc_summary so its +# two branches can be unit-tested without a fixture that carries an +# `/ID` array (no shipped fixture does). +file_id_hex_or_na <- function(r) { + if (length(r) == 0L) { + return(NA_character_) + } + paste(format(r), collapse = "") +} diff --git a/_pkgdown.yml b/_pkgdown.yml index 29a3dd1..80e8ed6 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -30,6 +30,7 @@ reference: - pdf_page_count - pdf_doc_info - pdf_doc_meta + - pdf_doc_summary - pdf_parse_date - pdf_doc_text - pdf_doc_fonts diff --git a/man/pdf_doc_summary.Rd b/man/pdf_doc_summary.Rd new file mode 100644 index 0000000..706fa52 --- /dev/null +++ b/man/pdf_doc_summary.Rd @@ -0,0 +1,75 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/doc.R +\name{pdf_doc_summary} +\alias{pdf_doc_summary} +\title{One-call summary of a PDF document} +\usage{ +pdf_doc_summary(doc, password = NULL) +} +\arguments{ +\item{doc}{A \code{pdfium_doc} from \code{\link[=pdf_doc_open]{pdf_doc_open()}}, or a character +path.} + +\item{password}{Optional password for encrypted PDFs when \code{doc} +is a path. Ignored when \code{doc} is an open \code{pdfium_doc}.} +} +\value{ +A one-row tibble. +} +\description{ +Returns a single-row tibble that aggregates the most-asked-for +facts about a PDF document: file path, page count, Info-dictionary +metadata, structural feature flags (forms, attachments, bookmarks, +signatures, JavaScript, tagged-PDF), counts for each of those +feature groups, encryption state, and the file-ID tuple. Designed +to replace the eight-or-so individual calls users typically chain +together when triaging a PDF. +} +\details{ +Each column either exposes an existing reader or is a \code{length()} +over the matching \verb{pdfium_*_list}. No new C-side work — purely an +R-side aggregation. See \strong{Columns} below for the source reader +for each entry. 
+} +\section{Columns}{ + +\itemize{ +\item \code{path} — character; canonical path the doc was opened from, or +\code{""} for in-memory loads. +\item \code{page_count}, \code{file_version} — from \code{\link[=pdf_doc_info]{pdf_doc_info()}}. +\item \code{title}, \code{author}, \code{subject}, \code{keywords}, \code{creator}, \code{producer}, +\code{creation_date}, \code{mod_date}, \code{trapped} — from \code{\link[=pdf_doc_info]{pdf_doc_info()}}; +missing entries appear as \code{""}. +\item \code{creation_date_parsed}, \code{mod_date_parsed} — POSIXct (UTC), \code{NA} +when the source date is empty or unparseable. From +\code{\link[=pdf_parse_date]{pdf_parse_date()}}. +\item \code{is_tagged} — from \code{\link[=pdf_doc_is_tagged]{pdf_doc_is_tagged()}}. +\item \code{is_encrypted} — \code{TRUE} when \code{\link[=pdf_doc_security]{pdf_doc_security()}} returns a +non-NA revision; \code{FALSE} otherwise. +\item \code{security_revision} — from \code{\link[=pdf_doc_security]{pdf_doc_security()}}; \code{NA} for +unencrypted PDFs. +\item \code{xref_valid} — from \code{\link[=pdf_doc_xref_valid]{pdf_doc_xref_valid()}}. +\item \code{bookmark_count}, \code{attachment_count}, \code{signature_count}, +\code{form_field_count}, \code{javascript_count}, \code{named_dest_count} — +\code{length()} of \code{\link[=pdf_doc_bookmarks]{pdf_doc_bookmarks()}}, \code{\link[=pdf_attachments]{pdf_attachments()}}, +\code{\link[=pdf_signatures]{pdf_signatures()}}, \code{\link[=pdf_form_fields]{pdf_form_fields()}}, \code{\link[=pdf_doc_javascript]{pdf_doc_javascript()}}, +and \code{\link[=pdf_doc_named_dests]{pdf_doc_named_dests()}} respectively. Zero when the +document has none of the corresponding entries. +\item \code{has_page_labels} — \code{TRUE} when \code{\link[=pdf_page_labels]{pdf_page_labels()}} returns +non-NA strings. +\item \code{file_id_permanent}, \code{file_id_changing} — from +\code{\link[=pdf_doc_file_id]{pdf_doc_file_id()}}; UTF-8 hex strings or \code{NA}. 
+} +} + +\examples{ +fixture <- system.file("extdata", "fixtures", "annotated.pdf", + package = "pdfium" +) +if (nzchar(fixture)) pdf_doc_summary(fixture) +} +\seealso{ +\code{\link[=pdf_doc_info]{pdf_doc_info()}} for the Info-dictionary subset alone, +the per-feature readers listed under \strong{Columns} for richer +per-row data. +} diff --git a/tests/testthat/test-doc-summary.R b/tests/testthat/test-doc-summary.R new file mode 100644 index 0000000..378c784 --- /dev/null +++ b/tests/testthat/test-doc-summary.R @@ -0,0 +1,131 @@ +# Tests for pdf_doc_summary() — the one-call "everything about this +# PDF" helper introduced post-v0.1.0. Exercises each column it +# claims to produce against the shipped fixtures. + +test_that("pdf_doc_summary returns a one-row tibble", { + s <- pdf_doc_summary(fixture_path("shapes")) + expect_s3_class(s, "tbl_df") + expect_equal(nrow(s), 1L) +}) + +test_that("pdf_doc_summary covers every documented column", { + s <- pdf_doc_summary(fixture_path("shapes")) + expected <- c( + "path", "page_count", "file_version", + "title", "author", "subject", "keywords", + "creator", "producer", "creation_date", "mod_date", "trapped", + "creation_date_parsed", "mod_date_parsed", + "is_tagged", "is_encrypted", "security_revision", "xref_valid", + "bookmark_count", "attachment_count", "signature_count", + "form_field_count", "javascript_count", "named_dest_count", + "has_page_labels", "file_id_permanent", "file_id_changing" + ) + expect_named(s, expected) +}) + +test_that("pdf_doc_summary column types are stable", { + s <- pdf_doc_summary(fixture_path("shapes")) + expect_type(s$path, "character") + expect_type(s$page_count, "integer") + expect_type(s$file_version, "integer") + expect_type(s$title, "character") + expect_s3_class(s$creation_date_parsed, "POSIXct") + expect_type(s$is_tagged, "logical") + expect_type(s$is_encrypted, "logical") + expect_type(s$xref_valid, "logical") + expect_type(s$bookmark_count, "integer") + expect_type(s$attachment_count, 
"integer") + expect_type(s$signature_count, "integer") + expect_type(s$form_field_count, "integer") + expect_type(s$javascript_count, "integer") + expect_type(s$named_dest_count, "integer") + expect_type(s$has_page_labels, "logical") +}) + +test_that("pdf_doc_summary reports counts on the annotated fixture", { + s <- pdf_doc_summary(fixture_path("annotated")) + # annotated.pdf has form fields + annotations. + expect_gt(s$form_field_count, 0L) + expect_identical(s$page_count, 1L) +}) + +test_that("pdf_doc_summary reports attachment count on attachments fixture", { + s <- pdf_doc_summary(fixture_path("attachments")) + expect_identical(s$attachment_count, 1L) +}) + +test_that("pdf_doc_summary reports zero counts on simple fixtures", { + s <- pdf_doc_summary(fixture_path("shapes")) + # shapes.pdf is a hand-built fixture that has no attachments, + # signatures, or form fields. + expect_identical(s$attachment_count, 0L) + expect_identical(s$signature_count, 0L) + expect_identical(s$form_field_count, 0L) + # Counts that are >= 0 integer scalars; exact values depend on + # the fixture build and aren't relevant to the contract. + expect_true(s$javascript_count >= 0L) + expect_true(s$bookmark_count >= 0L) + expect_true(s$named_dest_count >= 0L) +}) + +test_that("pdf_doc_summary accepts a path or an open doc", { + by_path <- pdf_doc_summary(fixture_path("shapes")) + doc <- pdf_doc_open(fixture_path("shapes")) + on.exit(pdf_doc_close(doc), add = TRUE) + by_doc <- pdf_doc_summary(doc) + # `path` differs (one is the doc's `path` slot, the other came in + # via path resolution); drop it before comparing. + drop_path <- function(t) t[, names(t) != "path"] + expect_identical(drop_path(by_path), drop_path(by_doc)) +}) + +test_that("pdf_doc_summary forwards the password argument", { + # When `doc` is already open, password is ignored. Exercise the + # path branch where it's forwarded to pdf_doc_open(). Use NULL to + # confirm the no-password path doesn't trip the assertion. 
+ s <- pdf_doc_summary(fixture_path("shapes"), password = NULL) + expect_equal(nrow(s), 1L) +}) + +test_that("pdf_doc_summary rejects a closed doc", { + doc <- pdf_doc_open(fixture_path("shapes")) + pdf_doc_close(doc) + expect_error(pdf_doc_summary(doc), "Document has been closed") +}) + +test_that("pdf_doc_summary rejects bad input", { + expect_error(pdf_doc_summary(42L), "Assertion on") + expect_error(pdf_doc_summary(NULL), "Assertion on") +}) + +test_that("pdf_doc_summary's is_encrypted is FALSE on unencrypted PDFs", { + s <- pdf_doc_summary(fixture_path("shapes")) + expect_false(s$is_encrypted) + expect_true(is.na(s$security_revision)) +}) + +test_that("pdf_doc_summary's path slot reflects the source", { + s_path <- pdf_doc_summary(fixture_path("shapes")) + expect_match(s_path$path, "shapes\\.pdf$") + + bytes <- readBin(fixture_path("shapes"), "raw", + file.info(fixture_path("shapes"))$size) + doc_raw <- pdf_doc_open(source = bytes) + on.exit(pdf_doc_close(doc_raw), add = TRUE) + s_raw <- pdf_doc_summary(doc_raw) + expect_identical(s_raw$path, "") +}) + +# file_id_hex_or_na ------------------------------------------------ +# The hex-string branch isn't exercised through pdf_doc_summary +# itself because no shipped fixture sets the /ID trailer entry. +# Test the helper directly. 
+ +test_that("file_id_hex_or_na returns NA on empty raw", { + expect_identical(pdfium:::file_id_hex_or_na(raw(0)), NA_character_) +}) + +test_that("file_id_hex_or_na hex-encodes non-empty raw bytes", { + bytes <- as.raw(c(0x00, 0xff, 0xab, 0x10)) + expect_identical(pdfium:::file_id_hex_or_na(bytes), "00ffab10") +}) From bab5592d8f5cfed4ef59e11bdadbc9eb1494060f Mon Sep 17 00:00:00 2001 From: Bill Denney Date: Thu, 21 May 2026 14:29:23 +0000 Subject: [PATCH 02/10] =?UTF-8?q?feat(page):=20add=20pdf=5Fpages=5Fsummary?= =?UTF-8?q?()=20=E2=80=94=20per-page=20sibling=20of=20doc=5Fsummary?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Returns a tibble with one row per page covering the cheap by-index metadata: width, height (PDF user-space points, pre-rotation), rotation in degrees (0/90/180/270), and the page label (or NA when absent). All four columns use the fast by-index PDFium readers (FPDF_GetPageSizeByIndexF + FPDF_GetPageRotation + FPDF_GetPageLabel), so the function does not load any page objects and scales linearly on long documents. Designed as the per-page sibling of pdf_doc_summary() — the same "give me everything cheap in one call" shape, parallel to pdftools::pdf_pagesize() but with rotation + label columns added. Accepts both a pdfium_doc and a character path; the path form opens + closes internally. Surfaces empty-string labels as NA for a cleaner "no label here" signal (PDFium can return "" for pages omitted from a partial /PageLabels array). 12 new tests in test-pages-summary.R cover column shape + types, the page_num sequence, dimensions sanity, agreement with pdf_page_size() + pdf_page_rotation() on a per-page basis, both input forms (path + doc), the multi-page case, password forwarding, closed-doc rejection, bad-input rejection, the empty-pages-summary helper, and the label-empty-to-NA contract. 
Defensive guards (missing-/PageLabels and zero-page-doc) marked # nocov — both unreachable from the shipped fixture set. Full suite 2111/2111 pass; R coverage 100% (2813/2813); 0 lints; pkgdown reference check passes. Co-Authored-By: Claude Opus 4.7 (1M context) --- NAMESPACE | 1 + NEWS.md | 6 ++ R/page.R | 92 +++++++++++++++++++++++++++++ _pkgdown.yml | 1 + man/pdf_pages_summary.Rd | 51 ++++++++++++++++ tests/testthat/test-pages-summary.R | 92 +++++++++++++++++++++++++++++ 6 files changed, 243 insertions(+) create mode 100644 man/pdf_pages_summary.Rd create mode 100644 tests/testthat/test-pages-summary.R diff --git a/NAMESPACE b/NAMESPACE index 3f15c60..58b435c 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -197,6 +197,7 @@ export(pdf_page_set_rotation) export(pdf_page_size) export(pdf_page_thumbnail) export(pdf_pages_reorder) +export(pdf_pages_summary) export(pdf_parse_date) export(pdf_path_append) export(pdf_path_bezier_to) diff --git a/NEWS.md b/NEWS.md index 580f04e..9a67c7c 100644 --- a/NEWS.md +++ b/NEWS.md @@ -9,6 +9,12 @@ each feature group, and the file-ID tuple. Replaces the eight-or-so individual calls users typically chain together when triaging a PDF. +* `pdf_pages_summary()` — per-page sibling of `pdf_doc_summary()`: + one row per page with `width`, `height` (PDF user-space points), + `rotation`, and `label`. All four columns use the fast by-index + PDFium calls, so the function does not load any page objects and + scales to long documents. Roughly the `pdftools::pdf_pagesize()` + equivalent, but with rotation + label columns added. 
# pdfium 0.1.0 diff --git a/R/page.R b/R/page.R index 71b86fe..c891b8b 100644 --- a/R/page.R +++ b/R/page.R @@ -140,3 +140,95 @@ pdf_page_rotation <- function(page, page_num = 1L) { on.exit(pdf_page_close(p), add = TRUE) cpp_page_rotation(p$ptr) } + +#' One-call summary of every page in a document +#' +#' Returns a tibble with one row per page covering the cheap +#' per-page facts: width, height (both in PDF user-space points, +#' pre-rotation), rotation in degrees, and the page label (if any). +#' The per-page values come from the existing single-page readers +#' [pdf_page_size()] (fast `FPDF_GetPageSizeByIndexF` path), +#' [pdf_page_rotation()], and [pdf_page_labels()]; no per-page +#' [pdf_page_load()] is required for any of them, so the function +#' is efficient on long documents. +#' +#' For deeper per-page facts (annotation count, object count, text +#' content, …) load each page individually with [pdf_page_load()] +#' and call the per-page readers. +#' +#' @param doc A `pdfium_doc` from [pdf_doc_open()], or a character +#' path. +#' @param password Optional password for encrypted PDFs when `doc` +#' is a path. Ignored when `doc` is an open `pdfium_doc`. +#' @return A tibble with columns: +#' * `page_num` — integer, 1-based. +#' * `width`, `height` — numeric, PDF user-space points. +#' * `rotation` — integer, `0` / `90` / `180` / `270`. +#' * `label` — character; the page's `/PageLabels` entry, or `NA` +#' when the document has no labels. +#' @seealso [pdf_doc_summary()] for the doc-level companion; +#' [pdf_page_size()], [pdf_page_rotation()], [pdf_page_labels()] +#' for the per-row readers. 
+#' @examples +#' fixture <- system.file("extdata", "fixtures", "minimal.pdf", +#' package = "pdfium" +#' ) +#' if (nzchar(fixture)) pdf_pages_summary(fixture) +#' @export +pdf_pages_summary <- function(doc, password = NULL) { + if (is.character(doc)) { + handle <- pdf_doc_open(doc, password = password) + on.exit(pdf_doc_close(handle), add = TRUE) + return(pdf_pages_summary(handle)) + } + checkmate::assert_class(doc, "pdfium_doc") + if (!is_open(doc)) stop("Document has been closed.", call. = FALSE) + + n <- pdf_page_count(doc) + labels <- tryCatch(pdf_page_labels(doc), error = function(e) NULL) + if (is.null(labels) || length(labels) != n) { + # nocov start — pdf_page_labels returns a length-n vector for + # every shipped fixture (apparently even ones without a + # /PageLabels array, e.g. minimal.pdf); guard exists for + # malformed PDFs in the wild. + labels <- rep(NA_character_, n) + # nocov end + } + # Some labels arrive as "" when the source PDF has a /PageLabels + # array that omits a specific page. Surface those as NA for a + # cleaner "no label here" signal. + labels[!is.na(labels) & !nzchar(labels)] <- NA_character_ + + if (n == 0L) { + return(empty_pages_summary()) # nocov — no shipped fixture has 0 pages. + } + + # Sizes use the by-index path (no page load). NOTE(review): confirm + # pdf_page_rotation(doc, i) does not load a page per call. + sizes <- lapply(seq_len(n), function(i) pdf_page_size(doc, i)) + rotations <- vapply(seq_len(n), function(i) { + as.integer(pdf_page_rotation(doc, i)) + }, integer(1L)) + + tibble::tibble( + page_num = seq_len(n), + width = vapply(sizes, function(s) as.numeric(s[["width"]]), + numeric(1L)), + height = vapply(sizes, function(s) as.numeric(s[["height"]]), + numeric(1L)), + rotation = rotations, + label = labels + ) +} + +# Internal: zero-row tibble matching pdf_pages_summary's shape, for +# docs with no pages (rare; mostly an in-memory-built corner case). 
+empty_pages_summary <- function() { + tibble::tibble( + page_num = integer(), + width = numeric(), + height = numeric(), + rotation = integer(), + label = character() + ) +} diff --git a/_pkgdown.yml b/_pkgdown.yml index 80e8ed6..44139ea 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -93,6 +93,7 @@ reference: - pdf_page_size - pdf_page_rotation - pdf_page_box + - pdf_pages_summary - pdf_page_links - pdf_link_at_point - pdf_link_annot_at_point diff --git a/man/pdf_pages_summary.Rd b/man/pdf_pages_summary.Rd new file mode 100644 index 0000000..5d53669 --- /dev/null +++ b/man/pdf_pages_summary.Rd @@ -0,0 +1,51 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/page.R +\name{pdf_pages_summary} +\alias{pdf_pages_summary} +\title{One-call summary of every page in a document} +\usage{ +pdf_pages_summary(doc, password = NULL) +} +\arguments{ +\item{doc}{A \code{pdfium_doc} from \code{\link[=pdf_doc_open]{pdf_doc_open()}}, or a character +path.} + +\item{password}{Optional password for encrypted PDFs when \code{doc} +is a path. Ignored when \code{doc} is an open \code{pdfium_doc}.} +} +\value{ +A tibble with columns: +\itemize{ +\item \code{page_num} — integer, 1-based. +\item \code{width}, \code{height} — numeric, PDF user-space points. +\item \code{rotation} — integer, \code{0} / \code{90} / \code{180} / \code{270}. +\item \code{label} — character; the page's \verb{/PageLabels} entry, or \code{NA} +when the document has no labels. +} +} +\description{ +Returns a tibble with one row per page covering the cheap +per-page facts: width, height (both in PDF user-space points, +pre-rotation), rotation in degrees, and the page label (if any). 
+The per-page values come from the existing single-page readers +\code{\link[=pdf_page_size]{pdf_page_size()}} (fast \code{FPDF_GetPageSizeByIndexF} path), +\code{\link[=pdf_page_rotation]{pdf_page_rotation()}}, and \code{\link[=pdf_page_labels]{pdf_page_labels()}}; no per-page +\code{\link[=pdf_page_load]{pdf_page_load()}} is required for any of them, so the function +is efficient on long documents. +} +\details{ +For deeper per-page facts (annotation count, object count, text +content, …) load each page individually with \code{\link[=pdf_page_load]{pdf_page_load()}} +and call the per-page readers. +} +\examples{ +fixture <- system.file("extdata", "fixtures", "minimal.pdf", + package = "pdfium" +) +if (nzchar(fixture)) pdf_pages_summary(fixture) +} +\seealso{ +\code{\link[=pdf_doc_summary]{pdf_doc_summary()}} for the doc-level companion; +\code{\link[=pdf_page_size]{pdf_page_size()}}, \code{\link[=pdf_page_rotation]{pdf_page_rotation()}}, \code{\link[=pdf_page_labels]{pdf_page_labels()}} +for the per-row readers. +} diff --git a/tests/testthat/test-pages-summary.R b/tests/testthat/test-pages-summary.R new file mode 100644 index 0000000..463001b --- /dev/null +++ b/tests/testthat/test-pages-summary.R @@ -0,0 +1,92 @@ +# Tests for pdf_pages_summary() — per-page sibling of +# pdf_doc_summary(). Returns one row per page with the cheap +# by-index metadata (size, rotation, label). 
+ +test_that("pdf_pages_summary returns one row per page", { + s <- pdf_pages_summary(fixture_path("minimal")) + expect_s3_class(s, "tbl_df") + expect_equal(nrow(s), 1L) # minimal.pdf is 1 page + expect_named(s, c("page_num", "width", "height", "rotation", "label")) +}) + +test_that("pdf_pages_summary column types are stable", { + s <- pdf_pages_summary(fixture_path("minimal")) + expect_type(s$page_num, "integer") + expect_type(s$width, "double") + expect_type(s$height, "double") + expect_type(s$rotation, "integer") + expect_type(s$label, "character") +}) + +test_that("pdf_pages_summary reports correct page_num sequence", { + s <- pdf_pages_summary(fixture_path("outline")) + expect_identical(s$page_num, seq_len(nrow(s))) +}) + +test_that("pdf_pages_summary reports sane dimensions", { + s <- pdf_pages_summary(fixture_path("minimal")) + expect_true(all(s$width > 0)) + expect_true(all(s$height > 0)) + expect_true(all(s$rotation %in% c(0L, 90L, 180L, 270L))) +}) + +test_that("pdf_pages_summary matches pdf_page_size + pdf_page_rotation", { + doc <- pdf_doc_open(fixture_path("outline")) + on.exit(pdf_doc_close(doc), add = TRUE) + s <- pdf_pages_summary(doc) + # Cross-check the first page against the per-page readers. 
+ page1 <- pdf_page_size(doc, 1L) + expect_identical(s$width[[1L]], as.numeric(page1[["width"]])) + expect_identical(s$height[[1L]], as.numeric(page1[["height"]])) + expect_identical(s$rotation[[1L]], + as.integer(pdf_page_rotation(doc, 1L))) +}) + +test_that("pdf_pages_summary accepts a path or open doc", { + by_path <- pdf_pages_summary(fixture_path("outline")) + doc <- pdf_doc_open(fixture_path("outline")) + on.exit(pdf_doc_close(doc), add = TRUE) + by_doc <- pdf_pages_summary(doc) + expect_identical(by_path, by_doc) +}) + +test_that("pdf_pages_summary handles multi-page documents", { + s <- pdf_pages_summary(fixture_path("outline")) + expect_gt(nrow(s), 1L) + expect_true(all(s$width > 0)) +}) + +test_that("pdf_pages_summary forwards the password argument", { + s <- pdf_pages_summary(fixture_path("minimal"), password = NULL) + expect_equal(nrow(s), 1L) +}) + +test_that("pdf_pages_summary rejects a closed doc", { + doc <- pdf_doc_open(fixture_path("minimal")) + pdf_doc_close(doc) + expect_error(pdf_pages_summary(doc), "Document has been closed") +}) + +test_that("pdf_pages_summary rejects bad input", { + expect_error(pdf_pages_summary(42L), "Assertion on") + expect_error(pdf_pages_summary(NULL), "Assertion on") +}) + +test_that("pdf_pages_summary label column is NA when no page labels", { + s <- pdf_pages_summary(fixture_path("minimal")) + # minimal.pdf has no /PageLabels. 
+ expect_true(all(is.na(s$label))) +}) + +# Internal helper ----------------------------------------------------- + +test_that("empty_pages_summary returns a zero-row tibble with the right shape", { + empty <- pdfium:::empty_pages_summary() + expect_s3_class(empty, "tbl_df") + expect_equal(nrow(empty), 0L) + expect_named(empty, c("page_num", "width", "height", "rotation", "label")) + expect_type(empty$page_num, "integer") + expect_type(empty$width, "double") + expect_type(empty$rotation, "integer") + expect_type(empty$label, "character") +}) From 7b841f597fe45d39e93f720a82d788cf66928208 Mon Sep 17 00:00:00 2001 From: Bill Denney Date: Thu, 21 May 2026 14:30:17 +0000 Subject: [PATCH 03/10] docs: point migration table at the new summary helpers Updates the "Switching from pdftools" table in vignettes/comparison.Rmd to point users at the post-v0.1.0 summary helpers: * `pdftools::pdf_info(path)` -> mention pdf_doc_summary as the richer alternative for one-call triage. * `pdftools::pdf_pagesize(path)` -> point at pdf_pages_summary rather than pdf_page_size, since pdf_pagesize is vectorised over pages and pdf_page_size is per-page. pdf_pages_summary matches the vectorised shape and adds rotation + label columns. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- vignettes/comparison.Rmd | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vignettes/comparison.Rmd b/vignettes/comparison.Rmd index a28fe82..d65d626 100644 --- a/vignettes/comparison.Rmd +++ b/vignettes/comparison.Rmd @@ -158,8 +158,8 @@ are close enough that switching is mostly a find-and-replace: | `pdftools` | `pdfium` | |---|---| | `pdf_text(path)` | `pdf_doc_text(path)` | -| `pdf_info(path)` | `pdf_doc_info(path)` | -| `pdf_pagesize(path)` | `pdf_page_size(doc, page_num)` | +| `pdf_info(path)` | `pdf_doc_info(path)` — or `pdf_doc_summary(path)` for a richer one-row tibble | +| `pdf_pagesize(path)` | `pdf_pages_summary(path)` (one row per page; also includes rotation + label) | | `pdf_render_page(path, ...)`| `pdf_render_page(doc_or_path, ...)` | | `pdf_data(path)` | `pdf_text_runs(page)` | | `pdf_doc_fonts(path)` | `pdf_doc_fonts(doc)` | From 954c77fb3b2b02a02fbf2386fdb181376b034258 Mon Sep 17 00:00:00 2001 From: Bill Denney Date: Thu, 21 May 2026 14:35:38 +0000 Subject: [PATCH 04/10] feat(doc): add summary.pdfium_doc S3 method Calling summary() on a pdfium_doc now dispatches to pdf_doc_summary(), matching the standard R idiom of print() for a quick "what is this" one-line string and summary() for the deep-dive tibble. The method lives in R/doc.R rather than R/classes.R so lintr's per-file object-usage check can see the pdf_doc_summary call in the same file. Two new tests confirm dispatch. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- NAMESPACE | 1 + NEWS.md | 4 ++++ R/classes.R | 1 + R/doc.R | 17 +++++++++++++++++ man/summary.pdfium_doc.Rd | 26 ++++++++++++++++++++++++++ tests/testthat/test-doc-summary.R | 14 ++++++++++++++ 6 files changed, 63 insertions(+) create mode 100644 man/summary.pdfium_doc.Rd diff --git a/NAMESPACE b/NAMESPACE index 58b435c..58bc38d 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -42,6 +42,7 @@ S3method(print,pdfium_obj_list) S3method(print,pdfium_page) S3method(print,pdfium_signature) S3method(print,pdfium_signature_list) +S3method(summary,pdfium_doc) export(as_pdfium_annot_list) export(as_pdfium_attachment_list) export(as_pdfium_bookmark_list) diff --git a/NEWS.md b/NEWS.md index 9a67c7c..2e1dd45 100644 --- a/NEWS.md +++ b/NEWS.md @@ -15,6 +15,10 @@ PDFium calls, so the function does not load any page objects and scales to long documents. Roughly the `pdftools::pdf_pagesize()` equivalent, but with rotation + label columns added. +* `summary()` S3 method for `pdfium_doc` — calling `summary(doc)` + now dispatches to [pdf_doc_summary()]. Matches the standard R + idiom of `print()` for a quick "what is this" string and + `summary()` for the deep-dive tibble. # pdfium 0.1.0 diff --git a/R/classes.R b/R/classes.R index 9a82c92..7fdc270 100644 --- a/R/classes.R +++ b/R/classes.R @@ -72,6 +72,7 @@ print.pdfium_doc <- function(x, ...) { invisible(x) } + #' Construct a `pdfium_page` from an external pointer #' #' Internal helper. The page's externalptr carries its parent document's diff --git a/R/doc.R b/R/doc.R index c31f939..d0ccc75 100644 --- a/R/doc.R +++ b/R/doc.R @@ -625,6 +625,23 @@ pdf_doc_summary <- function(doc, password = NULL) { if (is.null(a) || (length(a) == 1L && is.na(a))) b else a } +#' Document-level summary +#' +#' `summary()` method for `pdfium_doc`. 
Defers to +#' [pdf_doc_summary()] so users can call `summary(doc)` for the +#' single-row tibble of every key fact about the PDF — page count, +#' Info-dictionary metadata, structural feature flags, per-feature +#' counts, the file-ID tuple — in one call. +#' +#' @param object A `pdfium_doc` from [pdf_doc_open()]. +#' @param ... Unused (S3 generic compatibility). +#' @return The tibble returned by [pdf_doc_summary()]. +#' @seealso [pdf_doc_summary()]. +#' @export +summary.pdfium_doc <- function(object, ...) { + pdf_doc_summary(object) +} + # Internal: convert pdf_doc_file_id()'s raw return to a hex string, # or NA_character_ when empty. Hoisted from pdf_doc_summary so its # two branches can be unit-tested without a fixture that carries an diff --git a/man/summary.pdfium_doc.Rd b/man/summary.pdfium_doc.Rd new file mode 100644 index 0000000..114a457 --- /dev/null +++ b/man/summary.pdfium_doc.Rd @@ -0,0 +1,26 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/classes.R +\name{summary.pdfium_doc} +\alias{summary.pdfium_doc} +\title{Document-level summary} +\usage{ +\method{summary}{pdfium_doc}(object, ...) +} +\arguments{ +\item{object}{A \code{pdfium_doc} from \code{\link[=pdf_doc_open]{pdf_doc_open()}}.} + +\item{...}{Unused (S3 generic compatibility).} +} +\value{ +The tibble returned by \code{\link[=pdf_doc_summary]{pdf_doc_summary()}}. +} +\description{ +\code{summary()} method for \code{pdfium_doc}. Defers to +\code{\link[=pdf_doc_summary]{pdf_doc_summary()}} so users can call \code{summary(doc)} for the +single-row tibble of every key fact about the PDF — page count, +Info-dictionary metadata, structural feature flags, per-feature +counts, the file-ID tuple — in one call. +} +\seealso{ +\code{\link[=pdf_doc_summary]{pdf_doc_summary()}}. 
+} diff --git a/tests/testthat/test-doc-summary.R b/tests/testthat/test-doc-summary.R index 378c784..bf4fba5 100644 --- a/tests/testthat/test-doc-summary.R +++ b/tests/testthat/test-doc-summary.R @@ -129,3 +129,17 @@ test_that("file_id_hex_or_na hex-encodes non-empty raw bytes", { bytes <- as.raw(c(0x00, 0xff, 0xab, 0x10)) expect_identical(pdfium:::file_id_hex_or_na(bytes), "00ffab10") }) + +# summary.pdfium_doc S3 method ------------------------------------ + +test_that("summary(doc) dispatches to pdf_doc_summary", { + doc <- pdf_doc_open(fixture_path("shapes")) + on.exit(pdf_doc_close(doc), add = TRUE) + expect_identical(summary(doc), pdf_doc_summary(doc)) +}) + +test_that("summary(doc) returns a tibble", { + doc <- pdf_doc_open(fixture_path("shapes")) + on.exit(pdf_doc_close(doc), add = TRUE) + expect_s3_class(summary(doc), "tbl_df") +}) From 917ad87605b27dc33ed7ff2a449c3d68bc87481a Mon Sep 17 00:00:00 2001 From: Bill Denney Date: Thu, 21 May 2026 14:37:35 +0000 Subject: [PATCH 05/10] chore: revert version to 0.0.9000 (pre-first-release dev) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit v0.1.0 hasn't shipped to CRAN yet — the version string was aspirational. Switch back to the conventional pre-release development version (0.0.9000) until devtools::release() actually runs. NEWS.md: collapse the "(development version)" block I had bolted on top into the existing planned-0.1.0 section. The new pdf_doc_summary / pdf_pages_summary / summary.pdfium_doc entries join the v0.1.0 surface they were always going to ship with. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- DESCRIPTION | 2 +- NEWS.md | 30 ++++++++---------------------- 2 files changed, 9 insertions(+), 23 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 24eef99..174177d 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: pdfium Title: Idiomatic R Bindings to the PDFium PDF Engine -Version: 0.1.0.9000 +Version: 0.0.9000 Authors@R: c( person("Bill", "Denney", , "wdenney@humanpredictions.com", role = c("aut", "cre"), diff --git a/NEWS.md b/NEWS.md index 2e1dd45..6d08164 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,25 +1,3 @@ -# pdfium (development version) - -## New features - -* `pdf_doc_summary()` — one-call helper that returns a single-row - tibble aggregating the most-asked-for facts about a PDF: path, - page count, Info-dictionary metadata, structural feature flags - (tagged, encrypted, has-forms, has-attachments, …), counts for - each feature group, and the file-ID tuple. Replaces the - eight-or-so individual calls users typically chain together when - triaging a PDF. -* `pdf_pages_summary()` — per-page sibling of `pdf_doc_summary()`: - one row per page with `width`, `height` (PDF user-space points), - `rotation`, and `label`. All four columns use the fast by-index - PDFium calls, so the function does not load any page objects and - scales to long documents. Roughly the `pdftools::pdf_pagesize()` - equivalent, but with rotation + label columns added. -* `summary()` S3 method for `pdfium_doc` — calling `summary(doc)` - now dispatches to [pdf_doc_summary()]. Matches the standard R - idiom of `print()` for a quick "what is this" string and - `summary()` for the deep-dive tibble. - # pdfium 0.1.0 Initial CRAN release. This is the first public version of `pdfium`, @@ -49,6 +27,14 @@ PDFs created with `pdf_doc_new()` are also writable). * `pdf_page_load()` / `pdf_page_close()`, `pdf_page_size()`, `pdf_page_rotation()`, `pdf_page_box()`, `pdf_page_thumbnail()` — per-page handles and metadata. 
+* `pdf_doc_summary()` and `pdf_pages_summary()` — one-call triage + helpers. `pdf_doc_summary()` returns a single-row tibble + aggregating the most-asked-for facts about a PDF (path, page + count, Info-dictionary metadata, feature flags, per-feature + counts, file-ID tuple); `pdf_pages_summary()` is the per-page + sibling (width / height / rotation / label, all via the fast + by-index PDFium readers). `summary(doc)` dispatches to + `pdf_doc_summary()` so the standard R idiom works. ## Page objects, paths, and text From 6b48fcc3879be6016f9c124c058fb2c15d0685f7 Mon Sep 17 00:00:00 2001 From: Bill Denney Date: Thu, 21 May 2026 14:43:03 +0000 Subject: [PATCH 06/10] feat(page): add summary.pdfium_page() S3 method MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Calling summary() on a pdfium_page now returns a single-row tibble combining the cheap by-index columns (page_num, width, height, rotation, label — same shape pdf_pages_summary returns per row) with the per-page counts that the loaded page makes available: annotation_count, obj_count, text_run_count, link_count. Two-tier shape: the doc-wide pdf_pages_summary stays cheap (no page loads, no per-row counts); the page-level summary trades one already-loaded page for richer information. Users picking which one to call don't have to think about the cost — the loaded-page overload just exists, exposed through the standard R idiom. 5 new tests in test-pages-summary.R covering shape, columns, agreement with the underlying readers, error on closed page, and the real-label path against outline.pdf (the only shipped fixture with non-empty /PageLabels entries). Full suite 2124/2124 pass; R coverage 100% (2835/2835 lines); 0 lints. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- NAMESPACE | 1 + NEWS.md | 6 ++- R/page.R | 48 +++++++++++++++++++++ man/summary.pdfium_doc.Rd | 2 +- man/summary.pdfium_page.Rd | 37 ++++++++++++++++ tests/testthat/test-pages-summary.R | 65 +++++++++++++++++++++++++++++ 6 files changed, 156 insertions(+), 3 deletions(-) create mode 100644 man/summary.pdfium_page.Rd diff --git a/NAMESPACE b/NAMESPACE index 58bc38d..b871021 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -43,6 +43,7 @@ S3method(print,pdfium_page) S3method(print,pdfium_signature) S3method(print,pdfium_signature_list) S3method(summary,pdfium_doc) +S3method(summary,pdfium_page) export(as_pdfium_annot_list) export(as_pdfium_attachment_list) export(as_pdfium_bookmark_list) diff --git a/NEWS.md b/NEWS.md index 6d08164..c9c8282 100644 --- a/NEWS.md +++ b/NEWS.md @@ -33,8 +33,10 @@ PDFs created with `pdf_doc_new()` are also writable). count, Info-dictionary metadata, feature flags, per-feature counts, file-ID tuple); `pdf_pages_summary()` is the per-page sibling (width / height / rotation / label, all via the fast - by-index PDFium readers). `summary(doc)` dispatches to - `pdf_doc_summary()` so the standard R idiom works. + by-index PDFium readers). `summary(doc)` and `summary(page)` + dispatch to the matching tibble — `summary(page)` adds the + page-loaded counts (annotation count, page-object count, + text-run count, link count) since the page is already loaded. ## Page objects, paths, and text diff --git a/R/page.R b/R/page.R index c891b8b..e51678e 100644 --- a/R/page.R +++ b/R/page.R @@ -221,6 +221,54 @@ pdf_pages_summary <- function(doc, password = NULL) { ) } +#' Page-level summary +#' +#' `summary()` method for `pdfium_page`. 
Returns a single-row tibble +#' combining the cheap by-index columns +#' ([pdf_pages_summary()]-style: `page_num`, `width`, `height`, +#' `rotation`, `label`) with the per-page counts that require the +#' page to be loaded — annotation count, page-object count, text-run +#' count, and link count. Because the page handle is already loaded, +#' the per-count readers run against the existing page and don't +#' trigger an additional load. +#' +#' Use this for the "what's on this page?" interactive triage flow. +#' For the doc-wide companion, see [summary.pdfium_doc()]. +#' +#' @param object A `pdfium_page` from [pdf_page_load()]. +#' @param ... Unused (S3 generic compatibility). +#' @return A one-row tibble with columns `page_num`, `width`, +#' `height`, `rotation`, `label`, `annotation_count`, `obj_count`, +#' `text_run_count`, `link_count`. +#' @seealso [summary.pdfium_doc()] for the doc-wide companion, +#' [pdf_pages_summary()] for the per-document table without the +#' page-loaded counts. +#' @export +summary.pdfium_page <- function(object, ...) { + if (!is_open(object)) stop("Page has been closed.", call. = FALSE) + sz <- cpp_page_size(object$ptr) + labels <- tryCatch(pdf_page_labels(object$doc), + error = function(e) NULL) + label <- if (is.null(labels) || length(labels) < object$index) { + NA_character_ # nocov — shipped fixtures always return length-n. 
+ } else { + lbl <- labels[[object$index]] + if (is.na(lbl) || !nzchar(lbl)) NA_character_ else lbl + } + + tibble::tibble( + page_num = object$index, + width = as.numeric(sz[["width"]]), + height = as.numeric(sz[["height"]]), + rotation = as.integer(cpp_page_rotation(object$ptr)), + label = label, + annotation_count = length(pdf_annotations(object)), + obj_count = length(pdf_page_objects(object)), + text_run_count = nrow(pdf_text_runs(object)), + link_count = nrow(pdf_page_links(object)) + ) +} + # Internal: zero-row tibble matching pdf_pages_summary's shape, for # docs with no pages (rare; mostly an in-memory-built corner case). empty_pages_summary <- function() { diff --git a/man/summary.pdfium_doc.Rd b/man/summary.pdfium_doc.Rd index 114a457..0adc92b 100644 --- a/man/summary.pdfium_doc.Rd +++ b/man/summary.pdfium_doc.Rd @@ -1,5 +1,5 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/classes.R +% Please edit documentation in R/doc.R \name{summary.pdfium_doc} \alias{summary.pdfium_doc} \title{Document-level summary} diff --git a/man/summary.pdfium_page.Rd b/man/summary.pdfium_page.Rd new file mode 100644 index 0000000..e768d1c --- /dev/null +++ b/man/summary.pdfium_page.Rd @@ -0,0 +1,37 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/page.R +\name{summary.pdfium_page} +\alias{summary.pdfium_page} +\title{Page-level summary} +\usage{ +\method{summary}{pdfium_page}(object, ...) +} +\arguments{ +\item{object}{A \code{pdfium_page} from \code{\link[=pdf_page_load]{pdf_page_load()}}.} + +\item{...}{Unused (S3 generic compatibility).} +} +\value{ +A one-row tibble with columns \code{page_num}, \code{width}, +\code{height}, \code{rotation}, \code{label}, \code{annotation_count}, \code{obj_count}, +\code{text_run_count}, \code{link_count}. +} +\description{ +\code{summary()} method for \code{pdfium_page}. 
Returns a single-row tibble +combining the cheap by-index columns +(\code{\link[=pdf_pages_summary]{pdf_pages_summary()}}-style: \code{page_num}, \code{width}, \code{height}, +\code{rotation}, \code{label}) with the per-page counts that require the +page to be loaded — annotation count, page-object count, text-run +count, and link count. Because the page handle is already loaded, +the per-count readers run against the existing page and don't +trigger an additional load. +} +\details{ +Use this for the "what's on this page?" interactive triage flow. +For the doc-wide companion, see \code{\link[=summary.pdfium_doc]{summary.pdfium_doc()}}. +} +\seealso{ +\code{\link[=summary.pdfium_doc]{summary.pdfium_doc()}} for the doc-wide companion, +\code{\link[=pdf_pages_summary]{pdf_pages_summary()}} for the per-document table without the +page-loaded counts. +} diff --git a/tests/testthat/test-pages-summary.R b/tests/testthat/test-pages-summary.R index 463001b..33cb2c2 100644 --- a/tests/testthat/test-pages-summary.R +++ b/tests/testthat/test-pages-summary.R @@ -90,3 +90,68 @@ test_that("empty_pages_summary returns a zero-row tibble with the right shape", expect_type(empty$rotation, "integer") expect_type(empty$label, "character") }) + +# summary.pdfium_page S3 method ------------------------------------ + +test_that("summary(page) returns a one-row tibble", { + doc <- pdf_doc_open(fixture_path("annotated")) + on.exit(pdf_doc_close(doc), add = TRUE) + page <- pdf_page_load(doc, 1L) + on.exit(pdf_page_close(page), add = TRUE, after = FALSE) + s <- summary(page) + expect_s3_class(s, "tbl_df") + expect_equal(nrow(s), 1L) +}) + +test_that("summary(page) columns cover both cheap + page-loaded data", { + doc <- pdf_doc_open(fixture_path("annotated")) + on.exit(pdf_doc_close(doc), add = TRUE) + page <- pdf_page_load(doc, 1L) + on.exit(pdf_page_close(page), add = TRUE, after = FALSE) + s <- summary(page) + expect_named(s, c( + "page_num", "width", "height", "rotation", "label", + 
"annotation_count", "obj_count", "text_run_count", "link_count" + )) +}) + +test_that("summary(page) reports a positive annotation count on annotated.pdf", { + doc <- pdf_doc_open(fixture_path("annotated")) + on.exit(pdf_doc_close(doc), add = TRUE) + page <- pdf_page_load(doc, 1L) + on.exit(pdf_page_close(page), add = TRUE, after = FALSE) + s <- summary(page) + expect_gt(s$annotation_count, 0L) + expect_identical(s$page_num, 1L) +}) + +test_that("summary(page) matches direct per-page reader calls", { + doc <- pdf_doc_open(fixture_path("annotated")) + on.exit(pdf_doc_close(doc), add = TRUE) + page <- pdf_page_load(doc, 1L) + on.exit(pdf_page_close(page), add = TRUE, after = FALSE) + s <- summary(page) + expect_identical(s$annotation_count, length(pdf_annotations(page))) + expect_identical(s$obj_count, length(pdf_page_objects(page))) + expect_identical(s$text_run_count, nrow(pdf_text_runs(page))) + expect_identical(s$link_count, nrow(pdf_page_links(page))) +}) + +test_that("summary(page) rejects a closed page", { + doc <- pdf_doc_open(fixture_path("annotated")) + on.exit(pdf_doc_close(doc), add = TRUE) + page <- pdf_page_load(doc, 1L) + pdf_page_close(page) + expect_error(summary(page), "Page has been closed") +}) + +test_that("summary(page) surfaces real page labels", { + # outline.pdf is the only shipped fixture with a /PageLabels array + # whose entries aren't all empty strings — its first page is + # labelled "i" (roman numeral preface convention). 
+ doc <- pdf_doc_open(fixture_path("outline")) + on.exit(pdf_doc_close(doc), add = TRUE) + page <- pdf_page_load(doc, 1L) + on.exit(pdf_page_close(page), add = TRUE, after = FALSE) + expect_identical(summary(page)$label, "i") +}) From fda77d1457b72b3ce1cf7e4bd0d5e816d0caf53c Mon Sep 17 00:00:00 2001 From: Bill Denney Date: Thu, 21 May 2026 14:48:48 +0000 Subject: [PATCH 07/10] =?UTF-8?q?feat(doc):=20add=20pdf=5Fdoc=5Fopen=5Furl?= =?UTF-8?q?()=20=E2=80=94=20open=20a=20remote=20PDF=20in=20one=20call?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Convenience wrapper around pdf_doc_open(source = ...) that fetches the bytes of a URL (http://, https://, ftp://, or file://) via base R's url() + readBin() and loads through PDFium's in-memory path. No temporary file is created — the bytes live in R memory for the document's lifetime. The returned pdfium_doc's $path field is the URL string itself, so print() and pdf_doc_summary() surface the source even though no local path exists. Closes the most common user-facing convenience gap: today, users fetching a PDF from a URL have to chain download.file() + tempfile() + pdf_doc_open() themselves. One call is shorter, doesn't leave temp files on disk, and handles cleanup via existing pdfium_doc finalizers. 8 new tests in test-doc-open-url.R cover the file:// happy path, the URL-stored-as-path contract, password/readwrite forwarding, input-shape rejection (non-URL strings, bad types), connection errors (file:// to non-existent path, http(s) to unreachable hosts — suppressWarnings so the unreachable-host warning doesn't pollute test output), and a pdf_doc_summary() round-trip. Full suite 2140/2140 pass; R coverage 100% (2849/2849); 0 lints; pkgdown reference check passes. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- NAMESPACE | 1 + NEWS.md | 5 +- R/document.R | 54 ++++++++++++++++++++++ _pkgdown.yml | 1 + man/pdf_doc_open_url.Rd | 47 +++++++++++++++++++ tests/testthat/test-doc-open-url.R | 73 ++++++++++++++++++++++++++++++ 6 files changed, 180 insertions(+), 1 deletion(-) create mode 100644 man/pdf_doc_open_url.Rd create mode 100644 tests/testthat/test-doc-open-url.R diff --git a/NAMESPACE b/NAMESPACE index b871021..6b7f67a 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -119,6 +119,7 @@ export(pdf_doc_named_dest_by_name) export(pdf_doc_named_dests) export(pdf_doc_new) export(pdf_doc_open) +export(pdf_doc_open_url) export(pdf_doc_page_mode) export(pdf_doc_permissions) export(pdf_doc_security) diff --git a/NEWS.md b/NEWS.md index c9c8282..380325f 100644 --- a/NEWS.md +++ b/NEWS.md @@ -11,7 +11,10 @@ PDFs created with `pdf_doc_new()` are also writable). * `pdf_doc_open()` / `pdf_doc_close()`, `pdf_doc_new()`, `pdf_save()` / `pdf_save_to_raw()` — open existing PDFs (optionally with `readwrite = TRUE`), build new ones in memory, and persist - the result. + the result. `pdf_doc_open_url(url)` is a convenience wrapper that + fetches a `http://` / `https://` / `ftp://` / `file://` URL via + `url()` + `readBin()` and loads the bytes through PDFium's + in-memory path — no temporary file on disk. 
* `pdf_doc_info()`, `pdf_doc_meta()`, `pdf_doc_text()`, `pdf_doc_fonts()`, `pdf_doc_file_id()`, `pdf_doc_page_mode()`, `pdf_doc_viewer_preferences()`, `pdf_doc_viewer_preference_by_name()`, diff --git a/R/document.R b/R/document.R index b7cfffd..f949511 100644 --- a/R/document.R +++ b/R/document.R @@ -70,6 +70,60 @@ pdf_doc_open <- function(path = NULL, source = NULL, password = NULL, ) } +#' Open a PDF document from a URL +#' +#' Convenience wrapper around [pdf_doc_open()] that fetches the +#' bytes of a remote (or `file://`) URL via base R's [`url()`] + +#' [`readBin()`] and loads the result through PDFium's in-memory +#' path (`FPDF_LoadMemDocument64`). No temporary file is left on +#' disk; the bytes live in R memory for the document's lifetime. +#' +#' Network errors propagate from [`url()`] / [`readBin()`] (typical +#' shape: `cannot open URL '...'` from `connection failed`). The +#' returned `pdfium_doc`'s `$path` field is the URL string itself, +#' so [print()][print.pdfium_doc] and [pdf_doc_summary()] surface +#' the source even though no local path exists. +#' +#' @param url Character scalar. Must start with one of `http://`, +#' `https://`, `ftp://`, or `file://`. +#' @param password Optional password for encrypted PDFs. `NULL` +#' (the default) passes no password to PDFium. +#' @param readwrite Logical. As for [pdf_doc_open()]. +#' @return A `pdfium_doc`. +#' @seealso [pdf_doc_open()] for the doc-open primitive. +#' @examples +#' fixture <- system.file("extdata", "fixtures", "minimal.pdf", +#' package = "pdfium" +#' ) +#' if (nzchar(fixture)) { +#' doc <- pdf_doc_open_url(paste0("file://", fixture)) +#' pdf_page_count(doc) +#' pdf_doc_close(doc) +#' } +#' @export +pdf_doc_open_url <- function(url, password = NULL, readwrite = FALSE) { + checkmate::assert_string(url, min.chars = 1L) + if (!grepl("^(https?|ftp|file)://", url)) { + stop( + "`url` must start with http://, https://, ftp://, or file://. ", + "Got: ", url, + call. 
= FALSE + ) + } + con <- base::url(url, open = "rb") + on.exit(close(con), add = TRUE) + # readBin() needs an upper bound. NOTE(review): ?readBin says storage + # is reserved for `n` items, so this over-estimate may be costly. + bytes <- readBin(con, what = "raw", n = .Machine$integer.max) + doc <- pdf_doc_open(source = bytes, password = password, + readwrite = readwrite) + # Override the "" path with the source URL so + # downstream printing / pdf_doc_summary() shows where it came + # from. + doc$path <- url + doc +} + # Internal: validate the three pdf_doc_open() arguments. Split into # per-concern helpers so each stays under lintr's cyclocomp limit. validate_pdf_open_args <- function(path, source, password) { diff --git a/_pkgdown.yml b/_pkgdown.yml index 44139ea..69571e8 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -26,6 +26,7 @@ reference: - title: Documents contents: - pdf_doc_open + - pdf_doc_open_url - pdf_doc_close - pdf_page_count - pdf_doc_info diff --git a/man/pdf_doc_open_url.Rd b/man/pdf_doc_open_url.Rd new file mode 100644 index 0000000..de9259a --- /dev/null +++ b/man/pdf_doc_open_url.Rd @@ -0,0 +1,47 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/document.R +\name{pdf_doc_open_url} +\alias{pdf_doc_open_url} +\title{Open a PDF document from a URL} +\usage{ +pdf_doc_open_url(url, password = NULL, readwrite = FALSE) +} +\arguments{ +\item{url}{Character scalar. Must start with one of \verb{http://}, +\verb{https://}, \verb{ftp://}, or \verb{file://}.} + +\item{password}{Optional password for encrypted PDFs. \code{NULL} +(the default) passes no password to PDFium.} + +\item{readwrite}{Logical. As for \code{\link[=pdf_doc_open]{pdf_doc_open()}}.} +} +\value{ +A \code{pdfium_doc}. 
+} +\description{ +Convenience wrapper around \code{\link[=pdf_doc_open]{pdf_doc_open()}} that fetches the +bytes of a remote (or \verb{file://}) URL via base R's \code{\link[=url]{url()}} + +\code{\link[=readBin]{readBin()}} and loads the result through PDFium's in-memory +path (\code{FPDF_LoadMemDocument64}). No temporary file is left on +disk; the bytes live in R memory for the document's lifetime. +} +\details{ +Network errors propagate from \code{\link[=url]{url()}} / \code{\link[=readBin]{readBin()}} (typical +shape: \verb{cannot open URL '...'} from \verb{connection failed}). The +returned \code{pdfium_doc}'s \verb{$path} field is the URL string itself, +so \link[=print.pdfium_doc]{print()} and \code{\link[=pdf_doc_summary]{pdf_doc_summary()}} surface +the source even though no local path exists. +} +\examples{ +fixture <- system.file("extdata", "fixtures", "minimal.pdf", + package = "pdfium" +) +if (nzchar(fixture)) { + doc <- pdf_doc_open_url(paste0("file://", fixture)) + pdf_page_count(doc) + pdf_doc_close(doc) +} +} +\seealso{ +\code{\link[=pdf_doc_open]{pdf_doc_open()}} for the doc-open primitive. +} diff --git a/tests/testthat/test-doc-open-url.R b/tests/testthat/test-doc-open-url.R new file mode 100644 index 0000000..989fd2f --- /dev/null +++ b/tests/testthat/test-doc-open-url.R @@ -0,0 +1,73 @@ +# Tests for pdf_doc_open_url(). Live network fetches would have to be +# skipped on CRAN, so the happy-path tests use the `file://` scheme +# against a shipped fixture, which exercises the same url() + +# readBin() code path as a real `https://` URL without needing +# network access. 
+ +test_that("pdf_doc_open_url opens a file:// URL", { + url <- paste0("file://", fixture_path("minimal")) + doc <- pdf_doc_open_url(url) + on.exit(pdf_doc_close(doc), add = TRUE) + expect_s3_class(doc, "pdfium_doc") + expect_identical(pdf_page_count(doc), 1L) +}) + +test_that("pdf_doc_open_url stores the URL as the doc path", { + url <- paste0("file://", fixture_path("minimal")) + doc <- pdf_doc_open_url(url) + on.exit(pdf_doc_close(doc), add = TRUE) + expect_identical(doc$path, url) +}) + +test_that("pdf_doc_open_url forwards password + readwrite flags", { + url <- paste0("file://", fixture_path("minimal")) + doc <- pdf_doc_open_url(url, password = NULL, readwrite = TRUE) + on.exit(pdf_doc_close(doc), add = TRUE) + expect_true(doc$readwrite) +}) + +test_that("pdf_doc_open_url rejects non-URL strings", { + expect_error(pdf_doc_open_url("not-a-url"), + "must start with http://") + expect_error(pdf_doc_open_url("/path/to/file.pdf"), + "must start with http://") + expect_error(pdf_doc_open_url(""), "Assertion on") +}) + +test_that("pdf_doc_open_url rejects bad input types", { + expect_error(pdf_doc_open_url(42L), "Assertion on") + expect_error(pdf_doc_open_url(NULL), "Assertion on") + expect_error(pdf_doc_open_url(c("a", "b")), "Assertion on") +}) + +test_that("pdf_doc_open_url surfaces URL connection errors", { + bad_url <- "file:///definitely-not-a-file-on-this-system.pdf" + suppressWarnings(expect_error(pdf_doc_open_url(bad_url))) +}) + +test_that("pdf_doc_open_url accepts http(s) URLs structurally", { + # We can't actually fetch http(s) without network access, but the + # URL-shape validation should accept these prefixes and only fail + # later at the network step. base::url() emits a warning then + # errors on unreachable hosts; suppressWarnings so the test + # output isn't noisy. 
+ suppressWarnings({ + expect_error(pdf_doc_open_url("https://example.invalid/x.pdf")) + expect_error(pdf_doc_open_url("http://example.invalid/x.pdf")) + # Neither error should be the URL-shape error. + err1 <- tryCatch( + pdf_doc_open_url("https://example.invalid/x.pdf"), + error = function(e) conditionMessage(e) + ) + }) + expect_false(grepl("must start with", err1)) +}) + +test_that("pdf_doc_open_url round-trips through pdf_doc_summary", { + url <- paste0("file://", fixture_path("annotated")) + doc <- pdf_doc_open_url(url) + on.exit(pdf_doc_close(doc), add = TRUE) + s <- pdf_doc_summary(doc) + expect_identical(s$path, url) + expect_gt(s$form_field_count, 0L) # annotated.pdf has form fields +}) From 7c8fc1bb44802200ee37689b899d947f0bdcf104 Mon Sep 17 00:00:00 2001 From: Bill Denney Date: Thu, 21 May 2026 16:40:20 +0000 Subject: [PATCH 08/10] fix(ci): unbreak Rd cross-ref check + add a pre-commit hook to catch it MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CI flagged R CMD check WARNING on every platform: pdf_doc_open_url.Rd: print.pdfium_doc Please provide package anchors for all Rd \link{} targets not in the package itself and the base packages. The pdf_doc_open_url docstring had `[print()][print.pdfium_doc]` — markdown for "render `print()` as link to topic `print.pdfium_doc`". But `print.pdfium_doc` is an internal S3 method without its own Rd page, so the link can't resolve. Two changes: 1. Replace the bracketed cross-reference with plain `print()` inline code so the function name still renders as code but doesn't generate a broken link. Mirrors the same fix pattern used on PR #32's `[is_open()]` issue. 2. New pre-commit hook `rd-xref-check` (entry: tools/check-rd-xrefs.R) that runs the same internal R function `R CMD check` uses for its cross-reference step (tools:::.check_Rd_xrefs). Catches this class of WARNING on the developer machine before push. 
The script needs nothing more than the source tree (no install, no compile, no C++ build), so it's cheap enough to run on every commit — listed under `repo: local` next to the existing `pkgdown-reference-check` hook, with the same diagnostic-then- exit-1 pattern. Trigger files: any change to R/*.R, man/*.Rd, or DESCRIPTION. Manual verification: injecting the original broken link reproduces the CI failure shape: [check-rd-xrefs] pdf_doc_open_url.Rd: unresolved \link{} target 'print.pdfium_doc' Restoring the fix exits 0. Co-Authored-By: Claude Opus 4.7 (1M context) --- .pre-commit-config.yaml | 17 ++++++++++ R/document.R | 2 +- man/pdf_doc_open_url.Rd | 2 +- tools/check-rd-xrefs.R | 73 +++++++++++++++++++++++++++++++++++++++++ 4 files changed, 92 insertions(+), 2 deletions(-) create mode 100755 tools/check-rd-xrefs.R diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 72f38bb..44ebf21 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -114,6 +114,23 @@ repos: pass_filenames: false files: '^(_pkgdown\.yml|NAMESPACE|R/.*\.R)$' + - id: rd-xref-check + name: Rd cross-reference resolution + description: > + Catches \link{} targets in man/*.Rd that don't resolve to + either a topic in this package, a documented dependency, or + one of the base / recommended packages. This is the same + WARNING that `R CMD check --as-cran` emits under "checking + Rd cross-references" and that fails every platform of the + cross-platform R-CMD-check matrix. Uses the same internal R + function (tools:::.check_Rd_xrefs) R CMD check itself uses; + operates on the source tree without needing the package + installed - fast enough for every commit. + entry: Rscript tools/check-rd-xrefs.R + language: system + pass_filenames: false + files: '^(R/.*\.R|man/.*\.Rd|DESCRIPTION)$' + # Conventional Commits message check (server side: defense in depth via CI). 
- repo: https://github.com/compilerla/conventional-pre-commit rev: v3.6.0 diff --git a/R/document.R b/R/document.R index f949511..7710229 100644 --- a/R/document.R +++ b/R/document.R @@ -81,7 +81,7 @@ pdf_doc_open <- function(path = NULL, source = NULL, password = NULL, #' Network errors propagate from [`url()`] / [`readBin()`] (typical #' shape: `cannot open URL '...'` from `connection failed`). The #' returned `pdfium_doc`'s `$path` field is the URL string itself, -#' so [print()][print.pdfium_doc] and [pdf_doc_summary()] surface +#' so `print()` and [pdf_doc_summary()] surface #' the source even though no local path exists. #' #' @param url Character scalar. Must start with one of `http://`, diff --git a/man/pdf_doc_open_url.Rd b/man/pdf_doc_open_url.Rd index de9259a..99e86c8 100644 --- a/man/pdf_doc_open_url.Rd +++ b/man/pdf_doc_open_url.Rd @@ -29,7 +29,7 @@ disk; the bytes live in R memory for the document's lifetime. Network errors propagate from \code{\link[=url]{url()}} / \code{\link[=readBin]{readBin()}} (typical shape: \verb{cannot open URL '...'} from \verb{connection failed}). The returned \code{pdfium_doc}'s \verb{$path} field is the URL string itself, -so \link[=print.pdfium_doc]{print()} and \code{\link[=pdf_doc_summary]{pdf_doc_summary()}} surface +so \code{print()} and \code{\link[=pdf_doc_summary]{pdf_doc_summary()}} surface the source even though no local path exists. } \examples{ diff --git a/tools/check-rd-xrefs.R b/tools/check-rd-xrefs.R new file mode 100755 index 0000000..d5a1b67 --- /dev/null +++ b/tools/check-rd-xrefs.R @@ -0,0 +1,73 @@ +#!/usr/bin/env Rscript +# tools/check-rd-xrefs.R +# +# Validates that every \link{} target in every Rd file under man/ +# resolves — either to a topic in this package, to a topic in a +# documented dependency, or to one of the base / recommended +# packages. 
+# +# Catches the same class of WARNING that `R CMD check --as-cran` +# emits under "checking Rd cross-references": +# +# Found the following Rd file(s) with Rd \link{} targets +# missing package anchors: +# pdf_X.Rd: some_unresolved_topic +# +# That WARNING fails the cross-platform R-CMD-check matrix on every +# CI platform — better to catch it on the developer's machine. +# +# Uses `tools:::.check_Rd_xrefs()` (R-internal) which is the same +# function `R CMD check` itself invokes for this check. It only +# needs the source tree (no install / no compile), so it's +# cheap enough to run on every commit. +# +# Entry point for the corresponding pre-commit hook in +# .pre-commit-config.yaml. Exits 0 when every Rd cross-reference +# resolves; 1 with a diagnostic when at least one does not. +# +# Skips with a `message()` (exit status 0) when: +# - `tools:::.check_Rd_xrefs()` can't be found in this R (it's an +# unexported internal, hence the existence check below) +# - the package has no DESCRIPTION (run from somewhere other than +# a package root) + +local({ + if (!file.exists("DESCRIPTION")) { + message("[check-rd-xrefs] Not in a package root; skipping.") + return(invisible()) + } + # `tools:::.check_Rd_xrefs` is internal; existence-check first. + fn <- tryCatch( + get(".check_Rd_xrefs", envir = asNamespace("tools"), + inherits = FALSE), + error = function(e) NULL + ) + if (is.null(fn)) { + message("[check-rd-xrefs] tools:::.check_Rd_xrefs not available ", + "in this R; skipping. R-CMD-check on CI will still catch.") + return(invisible()) + } + + result <- fn(dir = ".") + if (length(result$bad) == 0L) { + return(invisible()) + } + + for (rd_file in names(result$bad)) { + topics <- result$bad[[rd_file]] + # Topics arrive as a named character; the names are the report + # categories (`report`, `legacy`, etc.) and the values are the + # unresolved topic strings.
+ for (topic in unique(topics)) { + message(sprintf( + "[check-rd-xrefs] %s: unresolved \\link{} target '%s'", + rd_file, topic + )) + } + } + message("[check-rd-xrefs] Fix by either qualifying the link with ", + "a package name (e.g. [graphics::plot()]), pointing it at ", + "an actual Rd topic in this package, or replacing the link ", + "with plain `code` formatting if the target has no Rd page.") + quit(status = 1L, save = "no") +}) From 6f16c08e59bf2f7a551ebbbab2a9cc653237017c Mon Sep 17 00:00:00 2001 From: Bill Denney Date: Thu, 21 May 2026 17:52:18 +0000 Subject: [PATCH 09/10] fix(ci): list summary.* methods in pkgdown index + harden the hook MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit pkgdown's `build_reference_index()` errored on PR #36: In _pkgdown.yml, 2 topics missing from index: "summary.pdfium_doc" and "summary.pdfium_page" Both S3 methods have their own @export and so their own Rd page (summary.pdfium_doc.Rd, summary.pdfium_page.Rd). pkgdown enforces that every non-internal Rd topic appears in the reference index. Two changes: 1. Add both to _pkgdown.yml next to their `pdf_*_summary()` companions. 2. Rewrite tools/check-pkgdown-reference.R to enumerate man/*.Rd files directly rather than reconstruct the topic set from NAMESPACE's `export()` + `S3method()` entries. The old design only flagged missing topics for NAMESPACE `export()` entries — `S3method()`-only entries (the path that produces summary.*.Rd) slipped through. The new design: * filters out Rd files marked `\keyword{internal}` (matches the only existing internal Rd, pdfium-package.Rd) * computes the topic set as { Rd basename } ∪ { every \alias{} entry inside }, so @rdname-collapsed methods (e.g. 
each pdfium_*_code paired with its _name in one Rd) still count as valid YAML entries * flags topics missing from YAML and YAML entries missing from man/, same diagnostic shape as before Manual verification: removing the summary.* entries from _pkgdown.yml reproduces the CI failure shape: [check-pkgdown-reference] Documented but not in _pkgdown.yml reference index: summary.pdfium_doc, summary.pdfium_page Restoring exits 0. Co-Authored-By: Claude Opus 4.7 (1M context) --- _pkgdown.yml | 2 ++ tools/check-pkgdown-reference.R | 51 +++++++++++++++++++++------------ 2 files changed, 34 insertions(+), 19 deletions(-) diff --git a/_pkgdown.yml b/_pkgdown.yml index 69571e8..b5a94cf 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -32,6 +32,7 @@ reference: - pdf_doc_info - pdf_doc_meta - pdf_doc_summary + - summary.pdfium_doc - pdf_parse_date - pdf_doc_text - pdf_doc_fonts @@ -95,6 +96,7 @@ reference: - pdf_page_rotation - pdf_page_box - pdf_pages_summary + - summary.pdfium_page - pdf_page_links - pdf_link_at_point - pdf_link_annot_at_point diff --git a/tools/check-pkgdown-reference.R b/tools/check-pkgdown-reference.R index 3c874bd..9290fcf 100644 --- a/tools/check-pkgdown-reference.R +++ b/tools/check-pkgdown-reference.R @@ -40,35 +40,48 @@ local({ use.names = FALSE)) yaml_topics <- yaml_topics[!is.na(yaml_topics) & nzchar(yaml_topics)] - ns_lines <- readLines("NAMESPACE") - # `exportPattern` and `exportClasses` not handled - this hook is - # for the typical case where pdfium's exports are all `export(name)`. - exports <- sub("^export\\(([^)]+)\\)$", "\\1", - grep("^export\\(", ns_lines, value = TRUE)) - # S3 methods registered via `@exportS3Method` appear in NAMESPACE - # as `S3method(generic, class)` or `S3method(pkg::generic, class)` - # when the generic lives in another package (e.g. - # `S3method(graphics::plot, pdfium_bitmap)`). 
pkgdown writes the - # method's Rd topic as the bare `generic.class` in either case, - # so strip any `pkg::` prefix on the generic before forming the - # topic name. - s3 <- sub("^S3method\\(([^,]+),\\s*([^)]+)\\)$", "\\1.\\2", - grep("^S3method\\(", ns_lines, value = TRUE)) - s3 <- sub("^[^.]+::", "", s3) - topics <- unique(c(exports, s3)) + # The canonical "what topics should pkgdown index" set is every + # man/*.Rd file that isn't marked `\keyword{internal}`. Walking + # the Rd files directly handles every topic-creation path + # (`@export`, S3 methods registered via `@exportS3Method`, manual + # `@aliases`/`@rdname`-collapsed methods) without having to + # re-parse NAMESPACE's S3method dispatch records. + rd_files <- list.files("man", pattern = "\\.Rd$", full.names = FALSE) + rd_files <- rd_files[nzchar(rd_files)] + if (length(rd_files) == 0L) { + return(invisible()) + } + + # A topic in pkgdown's reference can be either the Rd file's + # basename OR any \alias{} entry inside it (the @rdname-collapsed + # case: when several R functions share one Rd file, every + # function's name becomes an alias for the shared topic). Both + # forms resolve, so both count as valid YAML entries. 
+ rd_topics_and_aliases <- function(rd_file) { + lines <- readLines(file.path("man", rd_file), warn = FALSE) + if (any(grepl("\\\\keyword\\{internal\\}", lines))) { + return(character(0)) + } + base <- sub("\\.Rd$", "", rd_file) + aliases <- sub(".*\\\\alias\\{([^}]+)\\}.*", "\\1", + grep("\\\\alias\\{", lines, value = TRUE)) + unique(c(base, aliases)) + } + topics <- unique(unlist(lapply(rd_files, rd_topics_and_aliases), + use.names = FALSE)) - missing_in_yaml <- setdiff(exports, yaml_topics) + missing_in_yaml <- setdiff(topics, yaml_topics) unknown_in_yaml <- setdiff(yaml_topics, topics) problems <- character() if (length(missing_in_yaml) > 0L) { problems <- c(problems, sprintf( - "Exported but not in _pkgdown.yml reference index: %s", + "Documented but not in _pkgdown.yml reference index: %s", paste(missing_in_yaml, collapse = ", "))) } if (length(unknown_in_yaml) > 0L) { problems <- c(problems, sprintf( - "In _pkgdown.yml reference index but not an export: %s", + "In _pkgdown.yml reference index but no matching man/*.Rd: %s", paste(unknown_in_yaml, collapse = ", "))) } if (length(problems) > 0L) { From ffdf5aa04885a3e259b8d96564d6fae5d0f085e4 Mon Sep 17 00:00:00 2001 From: Bill Denney Date: Thu, 21 May 2026 17:56:07 +0000 Subject: [PATCH 10/10] =?UTF-8?q?feat(doc):=20add=20pdf=5Fdir=5Fsummary()?= =?UTF-8?q?=20=E2=80=94=20bulk-triage=20every=20PDF=20in=20a=20folder?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Scans a directory for PDF files and returns a tibble with one row per file in the pdf_doc_summary() column shape. The natural replacement for the standard "loop over a folder of PDFs and find the ones with forms / attachments / encryption" triage workflow. * Recursive descent via `recursive = TRUE`. * Case-insensitive `pattern = "\\.pdf$"` by default — picks up both `.pdf` and `.PDF`. * Optional shared `password` applied to every file. 
* `errors` argument selects how broken / non-PDF files are handled: * "warn" (default) — surface a warning per failure and skip * "skip" — silently skip * "stop" — abort on the first failure Internal pdf_doc_summary_empty() helper hoisted to module scope so its zero-row template can be tested without exercising the no-files-in-directory branch through the full file scan. 14 new tests in test-dir-summary.R cover row-per-PDF count, column shape parity with pdf_doc_summary, path preservation, recursive descent, the empty-directory case, the empty-tibble helper itself, case-insensitive .PDF matching, all three errors modes, the zero-rows-when-everything-fails case, password forwarding, input-shape rejection, and custom patterns. Full suite 2166/2166 pass; R coverage 100% (2907/2907 lines); 0 lints; pkgdown reference check passes (the hardened hook from the previous commit caught my omission of pdf_dir_summary from _pkgdown.yml during this commit's development — verifying the hook does what it should). Co-Authored-By: Claude Opus 4.7 (1M context) --- NAMESPACE | 1 + NEWS.md | 7 ++ R/doc.R | 110 ++++++++++++++++++++++++++ _pkgdown.yml | 1 + man/pdf_dir_summary.Rd | 62 +++++++++++++++ tests/testthat/test-dir-summary.R | 125 ++++++++++++++++++++++++++++++ 6 files changed, 306 insertions(+) create mode 100644 man/pdf_dir_summary.Rd create mode 100644 tests/testthat/test-dir-summary.R diff --git a/NAMESPACE b/NAMESPACE index 6b7f67a..9b1b700 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -105,6 +105,7 @@ export(pdf_bookmark_title) export(pdf_bookmark_uri) export(pdf_clip_path_count) export(pdf_clip_path_segments) +export(pdf_dir_summary) export(pdf_doc_bookmark_find) export(pdf_doc_bookmarks) export(pdf_doc_close) diff --git a/NEWS.md b/NEWS.md index 380325f..9b1414a 100644 --- a/NEWS.md +++ b/NEWS.md @@ -40,6 +40,13 @@ PDFs created with `pdf_doc_new()` are also writable). 
dispatch to the matching tibble — `summary(page)` adds the page-loaded counts (annotation count, page-object count, text-run count, link count) since the page is already loaded. +* `pdf_dir_summary(dir)` — scans a directory for PDF files and + returns one row per file in the `pdf_doc_summary()` shape. + Recursive scan via `recursive = TRUE`; pattern-matches `.pdf` + case-insensitively by default. The `errors` argument selects + one of `"warn"` (default — surface broken files but don't + abort), `"skip"` (silently drop), or `"stop"` (abort on the + first failure). ## Page objects, paths, and text diff --git a/R/doc.R b/R/doc.R index d0ccc75..6114187 100644 --- a/R/doc.R +++ b/R/doc.R @@ -642,6 +642,116 @@ summary.pdfium_doc <- function(object, ...) { pdf_doc_summary(object) } +#' Summarise every PDF in a directory in one call +#' +#' Scans a directory for PDF files and returns a tibble whose rows +#' are the [pdf_doc_summary()] output for each file. The natural +#' replacement for the standard "loop over a folder of PDFs and +#' triage" workflow — encrypted-which / has-forms-which / +#' has-attachments-which. +#' +#' Files that fail to open (corrupt, wrong format, password +#' protected) are handled per the `errors` argument: +#' +#' * `"warn"` (default) — a `warning()` per failed file; the file +#' is dropped from the result tibble. +#' * `"skip"` — silently dropped. +#' * `"stop"` — the first failed file raises an error and the +#' function aborts. +#' +#' @param dir Character scalar. Path to the directory to scan. +#' @param pattern Regular expression filtering filenames. Defaults +#' to `"\\.pdf$"` (case-insensitive). +#' @param recursive Logical. When `TRUE`, descend into +#' subdirectories. Defaults `FALSE`. +#' @param password Optional password applied to every file. `NULL` +#' (default) tries each file without a password. Useful when all +#' files share the same password. +#' @param errors One of `"warn"`, `"skip"`, `"stop"` — see Details. 
+#' @return A tibble with the same columns as [pdf_doc_summary()]. +#' Zero rows when the directory has no PDFs (or every PDF failed +#' to open under `errors = "skip"` / `"warn"`). +#' @seealso [pdf_doc_summary()] for the single-file companion. +#' @examples +#' fixture_dir <- system.file("extdata", "fixtures", +#' package = "pdfium") +#' if (nzchar(fixture_dir)) { +#' pdf_dir_summary(fixture_dir) +#' } +#' @export +pdf_dir_summary <- function(dir = ".", pattern = "\\.pdf$", + recursive = FALSE, password = NULL, + errors = c("warn", "skip", "stop")) { + checkmate::assert_directory_exists(dir) + checkmate::assert_string(pattern) + checkmate::assert_flag(recursive) + errors <- match.arg(errors) + + files <- list.files(dir, pattern = pattern, recursive = recursive, + full.names = TRUE, ignore.case = TRUE) + # Non-recursive list.files() also returns matching directory names; + # keep files only. An empty `files` yields the zero-row tibble below. + files <- files[!dir.exists(files)] + + rows <- lapply(files, function(f) { + tryCatch( + pdf_doc_summary(f, password = password), + error = function(e) { + if (errors == "stop") { + stop(sprintf("pdf_dir_summary: failed to read '%s': %s", + f, conditionMessage(e)), call. = FALSE) + } + if (errors == "warn") { + warning(sprintf("pdf_dir_summary: failed to read '%s': %s", + f, conditionMessage(e)), call. = FALSE) + } + NULL + } + ) + }) + ok <- !vapply(rows, is.null, logical(1L)) + if (!any(ok)) { + return(pdf_doc_summary_empty()) + } + out <- do.call(rbind, rows[ok]) + tibble::as_tibble(out) +} + +# Internal: zero-row tibble matching pdf_doc_summary's column shape. +# Used by pdf_dir_summary() when the directory is empty (or every +# file failed under `errors = "skip"` / `"warn"`).
+pdf_doc_summary_empty <- function() { + tibble::tibble( + path = character(), + page_count = integer(), + file_version = integer(), + title = character(), + author = character(), + subject = character(), + keywords = character(), + creator = character(), + producer = character(), + creation_date = character(), + mod_date = character(), + trapped = character(), + creation_date_parsed = as.POSIXct(character(), tz = "UTC"), + mod_date_parsed = as.POSIXct(character(), tz = "UTC"), + is_tagged = logical(), + is_encrypted = logical(), + security_revision = integer(), + xref_valid = logical(), + bookmark_count = integer(), + attachment_count = integer(), + signature_count = integer(), + form_field_count = integer(), + javascript_count = integer(), + named_dest_count = integer(), + has_page_labels = logical(), + file_id_permanent = character(), + file_id_changing = character() + ) +} + # Internal: convert pdf_doc_file_id()'s raw return to a hex string, # or NA_character_ when empty. Hoisted from pdf_doc_summary so its # two branches can be unit-tested without a fixture that carries an diff --git a/_pkgdown.yml b/_pkgdown.yml index b5a94cf..57e4679 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -32,6 +32,7 @@ reference: - pdf_doc_info - pdf_doc_meta - pdf_doc_summary + - pdf_dir_summary - summary.pdfium_doc - pdf_parse_date - pdf_doc_text diff --git a/man/pdf_dir_summary.Rd b/man/pdf_dir_summary.Rd new file mode 100644 index 0000000..aeafc80 --- /dev/null +++ b/man/pdf_dir_summary.Rd @@ -0,0 +1,62 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/doc.R +\name{pdf_dir_summary} +\alias{pdf_dir_summary} +\title{Summarise every PDF in a directory in one call} +\usage{ +pdf_dir_summary( + dir = ".", + pattern = "\\\\.pdf$", + recursive = FALSE, + password = NULL, + errors = c("warn", "skip", "stop") +) +} +\arguments{ +\item{dir}{Character scalar. Path to the directory to scan.} + +\item{pattern}{Regular expression filtering filenames. 
Defaults +to \code{"\\\\.pdf$"} (case-insensitive).} + +\item{recursive}{Logical. When \code{TRUE}, descend into +subdirectories. Defaults \code{FALSE}.} + +\item{password}{Optional password applied to every file. \code{NULL} +(default) tries each file without a password. Useful when all +files share the same password.} + +\item{errors}{One of \code{"warn"}, \code{"skip"}, \code{"stop"} — see Details.} +} +\value{ +A tibble with the same columns as \code{\link[=pdf_doc_summary]{pdf_doc_summary()}}. +Zero rows when the directory has no PDFs (or every PDF failed +to open under \code{errors = "skip"} / \code{"warn"}). +} +\description{ +Scans a directory for PDF files and returns a tibble whose rows +are the \code{\link[=pdf_doc_summary]{pdf_doc_summary()}} output for each file. The natural +replacement for the standard "loop over a folder of PDFs and +triage" workflow — encrypted-which / has-forms-which / +has-attachments-which. +} +\details{ +Files that fail to open (corrupt, wrong format, password +protected) are handled per the \code{errors} argument: +\itemize{ +\item \code{"warn"} (default) — a \code{warning()} per failed file; the file +is dropped from the result tibble. +\item \code{"skip"} — silently dropped. +\item \code{"stop"} — the first failed file raises an error and the +function aborts. +} +} +\examples{ +fixture_dir <- system.file("extdata", "fixtures", + package = "pdfium") +if (nzchar(fixture_dir)) { + pdf_dir_summary(fixture_dir) +} +} +\seealso{ +\code{\link[=pdf_doc_summary]{pdf_doc_summary()}} for the single-file companion. +} diff --git a/tests/testthat/test-dir-summary.R b/tests/testthat/test-dir-summary.R new file mode 100644 index 0000000..437584b --- /dev/null +++ b/tests/testthat/test-dir-summary.R @@ -0,0 +1,125 @@ +# Tests for pdf_dir_summary() — the bulk-triage helper that wraps +# pdf_doc_summary() over every PDF in a directory. + +# Helper to expose the shipped fixture directory. 
+fixture_dir <- function() { + system.file("extdata", "fixtures", package = "pdfium") +} + +test_that("pdf_dir_summary returns a tibble with one row per PDF", { + s <- pdf_dir_summary(fixture_dir()) + expect_s3_class(s, "tbl_df") + files <- list.files(fixture_dir(), pattern = "\\.pdf$", ignore.case = TRUE) + expect_equal(nrow(s), length(files)) +}) + +test_that("pdf_dir_summary column shape matches pdf_doc_summary", { + bulk <- pdf_dir_summary(fixture_dir()) + one <- pdf_doc_summary(fixture_path("shapes")) + expect_named(bulk, names(one)) +}) + +test_that("pdf_dir_summary preserves the path column", { + s <- pdf_dir_summary(fixture_dir()) + expect_true(all(grepl("\\.pdf$", s$path, ignore.case = TRUE))) + expect_true(all(file.exists(s$path))) +}) + +test_that("pdf_dir_summary recursive descent works", { + # Create a nested temp dir with two PDFs, one in a subdir. + tmp <- withr::local_tempdir() + file.copy(fixture_path("minimal"), file.path(tmp, "top.pdf")) + sub <- file.path(tmp, "subdir") + dir.create(sub) + file.copy(fixture_path("minimal"), file.path(sub, "nested.pdf")) + + flat <- pdf_dir_summary(tmp, recursive = FALSE) + expect_equal(nrow(flat), 1L) + + deep <- pdf_dir_summary(tmp, recursive = TRUE) + expect_equal(nrow(deep), 2L) +}) + +test_that("pdf_dir_summary returns zero rows for an empty dir", { + tmp <- withr::local_tempdir() + s <- pdf_dir_summary(tmp) + expect_s3_class(s, "tbl_df") + expect_equal(nrow(s), 0L) +}) + +test_that("pdf_dir_summary's empty tibble has the right shape", { + empty <- pdfium:::pdf_doc_summary_empty() + expect_s3_class(empty, "tbl_df") + expect_equal(nrow(empty), 0L) + one <- pdf_doc_summary(fixture_path("shapes")) + expect_named(empty, names(one)) +}) + +test_that("pdf_dir_summary case-insensitive PDF pattern matches .PDF too", { + tmp <- withr::local_tempdir() + file.copy(fixture_path("minimal"), file.path(tmp, "upper.PDF")) + file.copy(fixture_path("minimal"), file.path(tmp, "lower.pdf")) + s <- pdf_dir_summary(tmp) + expect_equal(nrow(s), 2L) +}) +
+test_that("pdf_dir_summary errors = stop aborts on a bad file", { + tmp <- withr::local_tempdir() + file.copy(fixture_path("minimal"), file.path(tmp, "good.pdf")) + writeLines("not a pdf", file.path(tmp, "bad.pdf")) + expect_error( + pdf_dir_summary(tmp, errors = "stop"), + "failed to read" + ) +}) + +test_that("pdf_dir_summary errors = warn drops bad files with a warning", { + tmp <- withr::local_tempdir() + file.copy(fixture_path("minimal"), file.path(tmp, "good.pdf")) + writeLines("not a pdf", file.path(tmp, "bad.pdf")) + s <- suppressWarnings(pdf_dir_summary(tmp, errors = "warn")) + expect_equal(nrow(s), 1L) + expect_warning( + pdf_dir_summary(tmp, errors = "warn"), + "failed to read" + ) +}) + +test_that("pdf_dir_summary errors = skip silently drops bad files", { + tmp <- withr::local_tempdir() + file.copy(fixture_path("minimal"), file.path(tmp, "good.pdf")) + writeLines("not a pdf", file.path(tmp, "bad.pdf")) + expect_no_warning(s <- pdf_dir_summary(tmp, errors = "skip")) + expect_equal(nrow(s), 1L) +}) + +test_that("pdf_dir_summary returns zero rows when every file fails", { + tmp <- withr::local_tempdir() + writeLines("not a pdf", file.path(tmp, "bad1.pdf")) + writeLines("also not a pdf", file.path(tmp, "bad2.pdf")) + s <- suppressWarnings(pdf_dir_summary(tmp, errors = "skip")) + expect_equal(nrow(s), 0L) +}) + +test_that("pdf_dir_summary forwards the password argument", { + s <- pdf_dir_summary(fixture_dir(), password = NULL) + expect_gt(nrow(s), 0L) +}) + +test_that("pdf_dir_summary rejects bad inputs", { + expect_error(pdf_dir_summary("/this/path/does/not/exist"), + "Assertion on") + expect_error(pdf_dir_summary(fixture_dir(), pattern = NA_character_), + "Assertion on") + expect_error(pdf_dir_summary(fixture_dir(), recursive = "yes"), + "Assertion on") + expect_error(pdf_dir_summary(fixture_dir(), errors = "bogus"), + "'arg' should be one of") +}) + +test_that("pdf_dir_summary respects a custom pattern", { + # Only match the annotated fixture. 
+ s <- pdf_dir_summary(fixture_dir(), pattern = "^annotated\\.pdf$") + expect_equal(nrow(s), 1L) + expect_match(s$path[[1L]], "annotated\\.pdf$") +})