Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,23 @@ repos:
pass_filenames: false
files: '^(_pkgdown\.yml|NAMESPACE|R/.*\.R)$'

- id: rd-xref-check
name: Rd cross-reference resolution
description: >
Catches \link{} targets in man/*.Rd that don't resolve to
either a topic in this package, a documented dependency, or
one of the base / recommended packages. This is the same
WARNING that `R CMD check --as-cran` emits under "checking
Rd cross-references" and that fails every platform of the
cross-platform R-CMD-check matrix. Uses the same internal R
function (tools:::.check_Rd_xrefs) R CMD check itself uses;
operates on the source tree without needing the package
installed - fast enough for every commit.
entry: Rscript tools/check-rd-xrefs.R
language: system
pass_filenames: false
files: '^(R/.*\.R|man/.*\.Rd|DESCRIPTION)$'

# Conventional Commits message check (server side: defense in depth via CI).
- repo: https://github.com/compilerla/conventional-pre-commit
rev: v3.6.0
Expand Down
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Package: pdfium
Title: Idiomatic R Bindings to the PDFium PDF Engine
Version: 0.1.0
Version: 0.0.9000
Authors@R: c(
person("Bill", "Denney", , "wdenney@humanpredictions.com",
role = c("aut", "cre"),
Expand Down
6 changes: 6 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,8 @@ S3method(print,pdfium_obj_list)
S3method(print,pdfium_page)
S3method(print,pdfium_signature)
S3method(print,pdfium_signature_list)
S3method(summary,pdfium_doc)
S3method(summary,pdfium_page)
export(as_pdfium_annot_list)
export(as_pdfium_attachment_list)
export(as_pdfium_bookmark_list)
Expand Down Expand Up @@ -103,6 +105,7 @@ export(pdf_bookmark_title)
export(pdf_bookmark_uri)
export(pdf_clip_path_count)
export(pdf_clip_path_segments)
export(pdf_dir_summary)
export(pdf_doc_bookmark_find)
export(pdf_doc_bookmarks)
export(pdf_doc_close)
Expand All @@ -117,10 +120,12 @@ export(pdf_doc_named_dest_by_name)
export(pdf_doc_named_dests)
export(pdf_doc_new)
export(pdf_doc_open)
export(pdf_doc_open_url)
export(pdf_doc_page_mode)
export(pdf_doc_permissions)
export(pdf_doc_security)
export(pdf_doc_set_language)
export(pdf_doc_summary)
export(pdf_doc_text)
export(pdf_doc_trailer_ends)
export(pdf_doc_user_permissions)
Expand Down Expand Up @@ -196,6 +201,7 @@ export(pdf_page_set_rotation)
export(pdf_page_size)
export(pdf_page_thumbnail)
export(pdf_pages_reorder)
export(pdf_pages_summary)
export(pdf_parse_date)
export(pdf_path_append)
export(pdf_path_bezier_to)
Expand Down
22 changes: 21 additions & 1 deletion NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,10 @@ PDFs created with `pdf_doc_new()` are also writable).
* `pdf_doc_open()` / `pdf_doc_close()`, `pdf_doc_new()`,
`pdf_save()` / `pdf_save_to_raw()` — open existing PDFs (optionally
with `readwrite = TRUE`), build new ones in memory, and persist
the result.
the result. `pdf_doc_open_url(url)` is a convenience wrapper that
fetches an `http://` / `https://` / `ftp://` / `file://` URL via
`url()` + `readBin()` and loads the bytes through PDFium's
in-memory path — no temporary file on disk.
* `pdf_doc_info()`, `pdf_doc_meta()`, `pdf_doc_text()`,
`pdf_doc_fonts()`, `pdf_doc_file_id()`, `pdf_doc_page_mode()`,
`pdf_doc_viewer_preferences()`, `pdf_doc_viewer_preference_by_name()`,
Expand All @@ -27,6 +30,23 @@ PDFs created with `pdf_doc_new()` are also writable).
* `pdf_page_load()` / `pdf_page_close()`, `pdf_page_size()`,
`pdf_page_rotation()`, `pdf_page_box()`, `pdf_page_thumbnail()` —
per-page handles and metadata.
* `pdf_doc_summary()` and `pdf_pages_summary()` — one-call triage
helpers. `pdf_doc_summary()` returns a single-row tibble
aggregating the most-asked-for facts about a PDF (path, page
count, Info-dictionary metadata, feature flags, per-feature
counts, file-ID tuple); `pdf_pages_summary()` is the per-page
sibling (width / height / rotation / label, all via the fast
by-index PDFium readers). `summary(doc)` and `summary(page)`
dispatch to the matching tibble — `summary(page)` adds the
page-loaded counts (annotation count, page-object count,
text-run count, link count) since the page is already loaded.
* `pdf_dir_summary(dir)` — scans a directory for PDF files and
returns one row per file in the `pdf_doc_summary()` shape.
Recursive scan via `recursive = TRUE`; pattern-matches `.pdf`
case-insensitively by default. The `errors` argument selects
one of `"warn"` (default — surface broken files but don't
abort), `"skip"` (silently drop), or `"stop"` (abort on the
first failure).

## Page objects, paths, and text

Expand Down
1 change: 1 addition & 0 deletions R/classes.R
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ print.pdfium_doc <- function(x, ...) {
invisible(x)
}


#' Construct a `pdfium_page` from an external pointer
#'
#' Internal helper. The page's externalptr carries its parent document's
Expand Down
256 changes: 256 additions & 0 deletions R/doc.R
Original file line number Diff line number Diff line change
Expand Up @@ -506,3 +506,259 @@ pdf_doc_trailer_ends <- function(doc) {
doc <- as_open_doc(doc)
cpp_doc_trailer_ends(doc$ptr)
}

#' One-call summary of a PDF document
#'
#' Collapses the handful of reader calls people usually chain when
#' triaging a PDF into one single-row tibble: file path, page count,
#' Info-dictionary metadata, structural feature flags (forms,
#' attachments, bookmarks, signatures, JavaScript, tagged-PDF), a
#' count for each of those feature groups, encryption state, and the
#' file-ID tuple.
#'
#' Every column is either the output of an existing reader or a
#' `length()` over the matching `pdfium_*_list`; this is purely an
#' R-side aggregation with no new C-side work. The **Columns**
#' section names the source reader for each entry.
#'
#' @section Columns:
#' * `path` — character; canonical path the doc was opened from, or
#'   `"<raw bytes>"` for in-memory loads.
#' * `page_count`, `file_version` — from [pdf_doc_info()].
#' * `title`, `author`, `subject`, `keywords`, `creator`, `producer`,
#'   `creation_date`, `mod_date`, `trapped` — from [pdf_doc_info()];
#'   missing entries appear as `""`.
#' * `creation_date_parsed`, `mod_date_parsed` — POSIXct (UTC), `NA`
#'   when the source date is empty or unparseable. From
#'   [pdf_parse_date()].
#' * `is_tagged` — from [pdf_doc_is_tagged()].
#' * `is_encrypted` — `TRUE` when [pdf_doc_security()] returns a
#'   non-NA revision; `FALSE` otherwise.
#' * `security_revision` — from [pdf_doc_security()]; `NA` for
#'   unencrypted PDFs.
#' * `xref_valid` — from [pdf_doc_xref_valid()].
#' * `bookmark_count`, `attachment_count`, `signature_count`,
#'   `form_field_count`, `javascript_count`, `named_dest_count` —
#'   `length()` of [pdf_doc_bookmarks()], [pdf_attachments()],
#'   [pdf_signatures()], [pdf_form_fields()], [pdf_doc_javascript()],
#'   and [pdf_doc_named_dests()] respectively. Zero when the
#'   document has none of the corresponding entries.
#' * `has_page_labels` — `TRUE` when [pdf_page_labels()] returns at
#'   least one non-NA, non-empty string; `FALSE` otherwise, including
#'   when the reader signals an error.
#' * `file_id_permanent`, `file_id_changing` — from
#'   [pdf_doc_file_id()]; hex strings, or `NA` when the ID is empty
#'   or the reader signals an error.
#'
#' @param doc A `pdfium_doc` from [pdf_doc_open()], or a character
#'   path. A path is opened for the duration of the call and closed
#'   again before returning.
#' @param password Optional password for encrypted PDFs when `doc`
#'   is a path. Ignored when `doc` is an open `pdfium_doc`.
#' @return A one-row tibble.
#' @seealso [pdf_doc_info()] for the Info-dictionary subset alone,
#'   the per-feature readers listed under **Columns** for richer
#'   per-row data.
#' @examples
#' fixture <- system.file("extdata", "fixtures", "annotated.pdf",
#'   package = "pdfium"
#' )
#' if (nzchar(fixture)) pdf_doc_summary(fixture)
#' @export
pdf_doc_summary <- function(doc, password = NULL) {
  # A character `doc` is a path: open it, summarise the temporary
  # handle, and let on.exit() close that handle again.
  if (is.character(doc)) {
    opened <- pdf_doc_open(doc, password = password)
    on.exit(pdf_doc_close(opened), add = TRUE)
    return(pdf_doc_summary(opened))
  }

  checkmate::assert_class(doc, "pdfium_doc")
  if (!is_open(doc)) {
    stop("Document has been closed.", call. = FALSE)
  }

  meta <- pdf_doc_info(doc)
  revision <- pdf_doc_security(doc)

  # Page labels and file IDs are best-effort: a reader error is mapped
  # to "no labels" / NA rather than failing the whole summary.
  labels <- tryCatch(pdf_page_labels(doc), error = function(e) NULL)
  read_file_id <- function(kind) {
    bytes <- tryCatch(
      pdf_doc_file_id(doc, kind),
      error = function(e) raw(0)
    )
    file_id_hex_or_na(bytes)
  }
  id_permanent <- read_file_id("permanent")
  id_changing <- read_file_id("changing")

  tibble::tibble(
    path = doc$path,
    page_count = meta$page_count,
    file_version = meta$file_version,
    # Info-dictionary strings: NULL / NA entries are reported as "".
    title = meta$title %||% "",
    author = meta$author %||% "",
    subject = meta$subject %||% "",
    keywords = meta$keywords %||% "",
    creator = meta$creator %||% "",
    producer = meta$producer %||% "",
    creation_date = meta$creation_date %||% "",
    mod_date = meta$mod_date %||% "",
    trapped = meta$trapped %||% "",
    creation_date_parsed = meta$creation_date_parsed,
    mod_date_parsed = meta$mod_date_parsed,
    is_tagged = pdf_doc_is_tagged(doc),
    is_encrypted = !is.na(revision),
    security_revision = revision,
    xref_valid = pdf_doc_xref_valid(doc),
    # Per-feature counts are the lengths of the corresponding readers.
    bookmark_count = length(pdf_doc_bookmarks(doc)),
    attachment_count = length(pdf_attachments(doc)),
    signature_count = length(pdf_signatures(doc)),
    form_field_count = length(pdf_form_fields(doc)),
    javascript_count = length(pdf_doc_javascript(doc)),
    named_dest_count = length(pdf_doc_named_dests(doc)),
    has_page_labels = !is.null(labels) &&
      any(!is.na(labels) & nzchar(labels)),
    file_id_permanent = id_permanent,
    file_id_changing = id_changing
  )
}

# Internal: null/NA-coalescing operator, defined here so the summary
# path does not need rlang. Yields `b` when `a` is NULL or a single
# NA; any other `a` (including zero-length and multi-element vectors)
# is returned unchanged.
`%||%` <- function(a, b) {
  if (is.null(a)) {
    return(b)
  }
  if (length(a) == 1L && is.na(a)) {
    return(b)
  }
  a
}

#' Document-level summary
#'
#' S3 `summary()` method for `pdfium_doc` objects. It simply hands
#' the document to [pdf_doc_summary()], so `summary(doc)` yields the
#' same single-row tibble of key facts about the PDF — page count,
#' Info-dictionary metadata, structural feature flags, per-feature
#' counts, and the file-ID tuple.
#'
#' @param object A `pdfium_doc` from [pdf_doc_open()].
#' @param ... Unused (S3 generic compatibility).
#' @return The tibble returned by [pdf_doc_summary()].
#' @seealso [pdf_doc_summary()].
#' @export
summary.pdfium_doc <- function(object, ...) {
  # `...` is accepted for generic compatibility only and is not forwarded.
  pdf_doc_summary(doc = object)
}

#' Summarise every PDF in a directory in one call
#'
#' Scans a directory for PDF files and returns a tibble whose rows
#' are the [pdf_doc_summary()] output for each file. The natural
#' replacement for the standard "loop over a folder of PDFs and
#' triage" workflow — encrypted-which / has-forms-which /
#' has-attachments-which.
#'
#' Only regular directory entries are summarised: subdirectories
#' whose names happen to match `pattern` (for example a folder named
#' `scans.pdf`) are ignored rather than reported as unreadable PDFs.
#'
#' Files that fail to open (corrupt, wrong format, password
#' protected) are handled per the `errors` argument:
#'
#' * `"warn"` (default) — a `warning()` per failed file; the file
#'   is dropped from the result tibble.
#' * `"skip"` — silently dropped.
#' * `"stop"` — the first failed file raises an error and the
#'   function aborts.
#'
#' @param dir Character scalar. Path to the directory to scan.
#' @param pattern Regular expression filtering filenames. Defaults
#'   to `"\\.pdf$"` (case-insensitive).
#' @param recursive Logical. When `TRUE`, descend into
#'   subdirectories. Defaults `FALSE`.
#' @param password Optional password applied to every file. `NULL`
#'   (default) tries each file without a password. Useful when all
#'   files share the same password.
#' @param errors One of `"warn"`, `"skip"`, `"stop"` — see Details.
#' @return A tibble with the same columns as [pdf_doc_summary()].
#'   Zero rows when the directory has no PDFs (or every PDF failed
#'   to open under `errors = "skip"` / `"warn"`).
#' @seealso [pdf_doc_summary()] for the single-file companion.
#' @examples
#' fixture_dir <- system.file("extdata", "fixtures",
#'   package = "pdfium"
#' )
#' if (nzchar(fixture_dir)) {
#'   pdf_dir_summary(fixture_dir)
#' }
#' @export
pdf_dir_summary <- function(dir = ".", pattern = "\\.pdf$",
                            recursive = FALSE, password = NULL,
                            errors = c("warn", "skip", "stop")) {
  checkmate::assert_directory_exists(dir)
  checkmate::assert_string(pattern)
  checkmate::assert_flag(recursive)
  errors <- match.arg(errors)

  files <- list.files(dir,
    pattern = pattern, recursive = recursive,
    full.names = TRUE, ignore.case = TRUE
  )
  # Non-recursive list.files() always includes matching subdirectory
  # names. Drop them so a folder called e.g. "scans.pdf" neither
  # triggers a spurious warning nor aborts an errors = "stop" scan.
  files <- files[!dir.exists(files)]
  if (length(files) == 0L) {
    return(pdf_doc_summary_empty())
  }

  # Summarise one file; on failure apply the `errors` policy and
  # return NULL so the file can be dropped from the result.
  summarise_one <- function(f) {
    tryCatch(
      pdf_doc_summary(f, password = password),
      error = function(e) {
        msg <- sprintf(
          "pdf_dir_summary: failed to read '%s': %s",
          f, conditionMessage(e)
        )
        # "skip" matches no branch and falls through silently.
        switch(errors,
          stop = stop(msg, call. = FALSE),
          warn = warning(msg, call. = FALSE)
        )
        NULL
      }
    )
  }

  rows <- Filter(Negate(is.null), lapply(files, summarise_one))
  if (length(rows) == 0L) {
    return(pdf_doc_summary_empty())
  }
  # Single rbind over the collected one-row tibbles (not in a loop).
  tibble::as_tibble(do.call(rbind, rows))
}

# Internal: builds the zero-row counterpart of pdf_doc_summary()'s
# result, with the same column names in the same order.
# pdf_dir_summary() returns it when no file matched, or when every
# file failed under `errors = "skip"` / `"warn"`.
pdf_doc_summary_empty <- function() {
  # Zero-length prototypes shared by the columns below.
  chr <- character()
  int <- integer()
  lgl <- logical()
  utc <- as.POSIXct(character(), tz = "UTC")

  tibble::tibble(
    path = chr,
    page_count = int,
    file_version = int,
    title = chr,
    author = chr,
    subject = chr,
    keywords = chr,
    creator = chr,
    producer = chr,
    creation_date = chr,
    mod_date = chr,
    trapped = chr,
    creation_date_parsed = utc,
    mod_date_parsed = utc,
    is_tagged = lgl,
    is_encrypted = lgl,
    security_revision = int,
    xref_valid = lgl,
    bookmark_count = int,
    attachment_count = int,
    signature_count = int,
    form_field_count = int,
    javascript_count = int,
    named_dest_count = int,
    has_page_labels = lgl,
    file_id_permanent = chr,
    file_id_changing = chr
  )
}

# Internal: turns the raw vector returned by pdf_doc_file_id() into
# one hex string (two hex digits per byte, concatenated), or
# NA_character_ when the input has length zero. Kept separate from
# pdf_doc_summary so both branches can be unit-tested without a
# fixture that carries an `/ID` array (no shipped fixture does).
file_id_hex_or_na <- function(r) {
  if (length(r) > 0L) {
    hex_bytes <- format(r)
    paste(hex_bytes, collapse = "")
  } else {
    NA_character_
  }
}
Loading
Loading