diff --git a/pyproject.toml b/pyproject.toml index 69d2a422..a28be645 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,6 +10,7 @@ requires-python = ">=3.10" dependencies = [ "exceptiongroup>=1.3.0", "httpx>=0.28.1", + "langcodes>=3.4.0", "pydantic>=2.12.0", ] diff --git a/src/pdfrest/client.py b/src/pdfrest/client.py index 8818899f..5c543630 100644 --- a/src/pdfrest/client.py +++ b/src/pdfrest/client.py @@ -66,41 +66,73 @@ PdfRestFileBasedResponse, PdfRestFileID, PdfRestInfoResponse, + SummarizePdfTextResponse, + TranslatePdfTextFileResponse, + TranslatePdfTextResponse, UpResponse, ) - -__all__ = ("AsyncPdfRestClient", "PdfRestClient") - from .models._internal import ( BasePdfRestGraphicPayload, BmpPdfRestPayload, + ConvertToMarkdownPayload, DeletePayload, + ExtractImagesPayload, + ExtractTextPayload, GifPdfRestPayload, JpegPdfRestPayload, + OcrPdfPayload, + PdfAddAttachmentPayload, PdfCompressPayload, + PdfFlattenAnnotationsPayload, PdfFlattenFormsPayload, + PdfFlattenTransparenciesPayload, PdfInfoPayload, + PdfLinearizePayload, PdfMergePayload, + PdfRasterizePayload, PdfRedactionApplyPayload, PdfRedactionPreviewPayload, PdfRestRawFileResponse, PdfSplitPayload, + PdfToExcelPayload, + PdfToPdfaPayload, PdfToPdfxPayload, + PdfToPowerpointPayload, PdfToWordPayload, + PdfXfaToAcroformsPayload, PngPdfRestPayload, + SummarizePdfTextPayload, TiffPdfRestPayload, + TranslatePdfTextPayload, UploadURLs, ) from .types import ( ALL_PDF_INFO_QUERIES, + BmpColorModel, + CompressionLevel, + ExtractTextGranularity, + FlattenQuality, + GifColorModel, + GraphicSmoothing, + JpegColorModel, + OcrLanguage, + PdfAType, PdfInfoQuery, PdfMergeInput, PdfPageSelection, PdfRedactionInstruction, PdfRGBColor, PdfXType, + PngColorModel, + SummaryFormat, + SummaryOutputFormat, + TiffColorModel, + TranslateOutputFormat, ) +__all__ = ("AsyncPdfRestClient", "PdfRestClient") +FileResponseModel = TypeVar("FileResponseModel", bound=PdfRestFileBasedResponse) + DEFAULT_BASE_URL = "https://api.pdfrest.com" API_KEY_ENV_VAR = "PDFREST_API_KEY" API_KEY_HEADER_NAME = "Api-Key" @@ -965,11 +997,12 @@ def _post_file_operation( endpoint: str, payload: dict[str, Any], payload_model: type[BaseModel], + response_model: type[FileResponseModel] = PdfRestFileBasedResponse, extra_query: Query | None = None, extra_headers: AnyMapping | None = None, extra_body: Body | None = None, timeout: TimeoutTypes | None = None, - ) -> PdfRestFileBasedResponse: + ) -> FileResponseModel: job_options = payload_model.model_validate(payload) json_body = job_options.model_dump( mode="json", by_alias=True, exclude_none=True, exclude_unset=True @@ -997,15 +1030,17 @@ def _post_file_operation( for file_id in output_ids ] - return PdfRestFileBasedResponse.model_validate( - { - "input_id": [str(file_id) for file_id in raw_response.input_id], - "output_file": [ - file.model_dump(mode="json", by_alias=True) for file in output_files - ], - "warning": raw_response.warning, - } - ) + response_payload: dict[str, Any] = { + "input_id": [str(file_id) for file_id in raw_response.input_id], + "output_file": [ + file.model_dump(mode="json", by_alias=True) for file in output_files + ], + "warning": raw_response.warning, + } + if raw_response.model_extra: + response_payload.update(raw_response.model_extra) + + return response_model.model_validate(response_payload) def send_request(self, request: _RequestModel) -> Any: return self._send_request(request) @@ -1229,11 +1264,12 @@ async def _post_file_operation( endpoint: str, payload: dict[str, Any], payload_model: type[BaseModel], + response_model: type[FileResponseModel] = PdfRestFileBasedResponse, extra_query: Query | None = None, extra_headers: AnyMapping | None = None, extra_body: Body | None = None, timeout: TimeoutTypes | None = None, - ) -> PdfRestFileBasedResponse: + ) -> FileResponseModel: job_options = payload_model.model_validate(payload) request = self.prepare_request( "POST", @@ -1269,15 +1305,17 @@ async def throttled_fetch_file_info(file_id: str) -> PdfRestFile: ) ) - return PdfRestFileBasedResponse.model_validate( - { - "input_id": [str(file_id) for file_id in raw_response.input_id], - "output_file": [ - file.model_dump(mode="json", by_alias=True) for file in output_files - ], - "warning": raw_response.warning, - } - ) + response_payload: dict[str, Any] = { + "input_id": [str(file_id) for file_id in raw_response.input_id], + "output_file": [ + file.model_dump(mode="json", by_alias=True) for file in output_files + ], + "warning": raw_response.warning, + } + if raw_response.model_extra: + response_payload.update(raw_response.model_extra) + + return response_model.model_validate(response_payload) async def send_request(self, request: _RequestModel) -> Any: return await self._send_request(request) @@ -2105,442 +2143,968 @@ def query_pdf_info( raw_payload = self._send_request(request) return PdfRestInfoResponse.model_validate(raw_payload) - def preview_redactions( + def summarize_text( self, file: PdfRestFile | Sequence[PdfRestFile], *, - redactions: PdfRedactionInstruction | Sequence[PdfRedactionInstruction], + target_word_count: int = 400, + summary_format: SummaryFormat = "overview", + pages: PdfPageSelection | None = None, + output_format: SummaryOutputFormat = "markdown", output: str | None = None, extra_query: Query | None = None, extra_headers: AnyMapping | None = None, extra_body: Body | None = None, timeout: TimeoutTypes | None = None, - ) -> PdfRestFileBasedResponse: - """Generate a PDF redaction preview with annotated redaction rectangles.""" + ) -> SummarizePdfTextResponse: + """Summarize the textual content of a PDF, Markdown, or text document. + + Always requests JSON output and returns the inline summary response defined in + the pdfRest API reference. + """ payload: dict[str, Any] = { "files": file, - "redactions": redactions, + "target_word_count": target_word_count, + "summary_format": summary_format, + "output_format": output_format, + "output_type": "json", } + if pages is not None: + payload["pages"] = pages if output is not None: payload["output"] = output - return self._post_file_operation( - endpoint="/pdf-with-redacted-text-preview", - payload=payload, - payload_model=PdfRedactionPreviewPayload, + validated_payload = SummarizePdfTextPayload.model_validate(payload) + request = self.prepare_request( + "POST", + "/summarized-pdf-text", + json_body=validated_payload.model_dump( + mode="json", by_alias=True, exclude_none=True, exclude_unset=True + ), extra_query=extra_query, extra_headers=extra_headers, extra_body=extra_body, timeout=timeout, ) + raw_payload = self._send_request(request) + return SummarizePdfTextResponse.model_validate(raw_payload) - def apply_redactions( + def summarize_text_to_file( self, file: PdfRestFile | Sequence[PdfRestFile], *, - rgb_color: PdfRGBColor | Sequence[int] | None = None, + target_word_count: int = 400, + summary_format: SummaryFormat = "overview", + pages: PdfPageSelection | None = None, + output_format: SummaryOutputFormat = "markdown", output: str | None = None, extra_query: Query | None = None, extra_headers: AnyMapping | None = None, extra_body: Body | None = None, timeout: TimeoutTypes | None = None, ) -> PdfRestFileBasedResponse: - """Apply previously previewed redactions and return the final redacted PDF.""" + """Summarize a document and return the result as a downloadable file.""" payload: dict[str, Any] = { "files": file, + "target_word_count": target_word_count, + "summary_format": summary_format, + "output_format": output_format, + "output_type": "file", } - if rgb_color is not None: - payload["rgb_color"] = rgb_color + if pages is not None: + payload["pages"] = pages if output is not None: payload["output"] = output return self._post_file_operation( - endpoint="/pdf-with-redacted-text-applied", + endpoint="/summarized-pdf-text", payload=payload, - payload_model=PdfRedactionApplyPayload, + payload_model=SummarizePdfTextPayload, extra_query=extra_query, extra_headers=extra_headers, extra_body=extra_body, timeout=timeout, ) - def split_pdf( + def convert_to_markdown( self, file: PdfRestFile | Sequence[PdfRestFile], *, - page_groups: Sequence[PdfPageSelection] | PdfPageSelection | None = None, - output_prefix: str | None = None, + pages: PdfPageSelection | None = None, + page_break_comments: bool = False, + output: str | None = None, extra_query: Query | None = None, extra_headers: AnyMapping | None = None, extra_body: Body | None = None, timeout: TimeoutTypes | None = None, ) -> PdfRestFileBasedResponse: - """Split a PDF into one or more PDF files based on the provided page groups.""" + """Convert a PDF to Markdown and return a file-based response.""" - payload: dict[str, Any] = {"files": file} - if page_groups is not None: - payload["page_groups"] = page_groups - if output_prefix is not None: - payload["output_prefix"] = output_prefix + payload: dict[str, Any] = { + "files": file, + "output_type": "file", + "page_break_comments": page_break_comments, + } + if pages is not None: + payload["pages"] = pages + if output is not None: + payload["output"] = output return self._post_file_operation( - endpoint="/split-pdf", + endpoint="/markdown", payload=payload, - payload_model=PdfSplitPayload, + payload_model=ConvertToMarkdownPayload, extra_query=extra_query, extra_headers=extra_headers, extra_body=extra_body, timeout=timeout, ) - def merge_pdfs( + def ocr_pdf( self, - sources: Sequence[PdfMergeInput], + file: PdfRestFile | Sequence[PdfRestFile], *, - output_prefix: str | None = None, + languages: OcrLanguage | Sequence[OcrLanguage] = "English", + pages: PdfPageSelection | None = None, + output: str | None = None, extra_query: Query | None = None, extra_headers: AnyMapping | None = None, extra_body: Body | None = None, timeout: TimeoutTypes | None = None, ) -> PdfRestFileBasedResponse: - """Merge multiple PDFs (or page subsets) into a single PDF file.""" + """Perform OCR on a PDF to make text searchable and extractable.""" - payload: dict[str, Any] = {"sources": sources} - if output_prefix is not None: - payload["output_prefix"] = output_prefix + payload: dict[str, Any] = {"files": file, "languages": languages} + if pages is not None: + payload["pages"] = pages + if output is not None: + payload["output"] = output return self._post_file_operation( - endpoint="/merged-pdf", + endpoint="/pdf-with-ocr-text", payload=payload, - payload_model=PdfMergePayload, + payload_model=OcrPdfPayload, extra_query=extra_query, extra_headers=extra_headers, extra_body=extra_body, timeout=timeout, ) - def convert_to_word( + def translate_pdf_text( self, file: PdfRestFile | Sequence[PdfRestFile], *, + output_language: str, + pages: PdfPageSelection | None = None, + output_format: TranslateOutputFormat = "markdown", output: str | None = None, extra_query: Query | None = None, extra_headers: AnyMapping | None = None, extra_body: Body | None = None, timeout: TimeoutTypes | None = None, - ) -> PdfRestFileBasedResponse: - """Convert a PDF to a Word document.""" + ) -> TranslatePdfTextResponse: + """Translate the textual content of a PDF, Markdown, or text document (JSON).""" - payload: dict[str, Any] = {"files": file} + payload: dict[str, Any] = { + "files": file, + "output_language": output_language, + "output_format": output_format, + "output_type": "json", + } + if pages is not None: + payload["pages"] = pages if output is not None: payload["output"] = output - return self._post_file_operation( - endpoint="/word", - payload=payload, - payload_model=PdfToWordPayload, + validated_payload = TranslatePdfTextPayload.model_validate(payload) + request = self.prepare_request( + "POST", + "/translated-pdf-text", + json_body=validated_payload.model_dump( + mode="json", by_alias=True, exclude_none=True, exclude_unset=True + ), extra_query=extra_query, extra_headers=extra_headers, extra_body=extra_body, timeout=timeout, ) + raw_payload = self._send_request(request) + return TranslatePdfTextResponse.model_validate(raw_payload) - def flatten_pdf_forms( + def translate_pdf_text_to_file( self, file: PdfRestFile | Sequence[PdfRestFile], *, + output_language: str, + pages: PdfPageSelection | None = None, + output_format: TranslateOutputFormat = "markdown", output: str | None = None, extra_query: Query | None = None, extra_headers: AnyMapping | None = None, extra_body: Body | None = None, timeout: TimeoutTypes | None = None, - ) -> PdfRestFileBasedResponse: - """Flatten form fields in a PDF so they are no longer editable.""" + ) -> TranslatePdfTextFileResponse: + """Translate textual content and receive a file-based response.""" - payload: dict[str, Any] = {"files": file} + payload: dict[str, Any] = { + "files": file, + "output_language": output_language, + "output_format": output_format, + "output_type": "file", + } + if pages is not None: + payload["pages"] = pages if output is not None: payload["output"] = output return self._post_file_operation( - endpoint="/flattened-forms-pdf", + endpoint="/translated-pdf-text", payload=payload, - payload_model=PdfFlattenFormsPayload, + payload_model=TranslatePdfTextPayload, extra_query=extra_query, extra_headers=extra_headers, extra_body=extra_body, timeout=timeout, + response_model=TranslatePdfTextFileResponse, ) - def compress_pdf( + def extract_images( self, file: PdfRestFile | Sequence[PdfRestFile], *, - compression_level: Literal["low", "medium", "high", "custom"], - profile: PdfRestFile | Sequence[PdfRestFile] | None = None, + pages: PdfPageSelection | None = None, output: str | None = None, extra_query: Query | None = None, extra_headers: AnyMapping | None = None, extra_body: Body | None = None, timeout: TimeoutTypes | None = None, ) -> PdfRestFileBasedResponse: - """Compress a PDF using preset or custom compression profiles.""" + """Extract embedded images from a PDF.""" - payload: dict[str, Any] = { - "files": file, - "compression_level": compression_level, - } - if profile is not None: - payload["profile"] = profile + payload: dict[str, Any] = {"files": file} + if pages is not None: + payload["pages"] = pages if output is not None: payload["output"] = output return self._post_file_operation( - endpoint="/compressed-pdf", + endpoint="/extracted-images", payload=payload, - payload_model=PdfCompressPayload, + payload_model=ExtractImagesPayload, extra_query=extra_query, extra_headers=extra_headers, extra_body=extra_body, timeout=timeout, ) - def convert_to_pdfx( + def extract_pdf_text_to_file( self, file: PdfRestFile | Sequence[PdfRestFile], *, - output_type: PdfXType, + pages: PdfPageSelection | None = None, + full_text: ExtractTextGranularity = "document", + preserve_line_breaks: bool = False, + word_style: bool = False, + word_coordinates: bool = False, output: str | None = None, extra_query: Query | None = None, extra_headers: AnyMapping | None = None, extra_body: Body | None = None, timeout: TimeoutTypes | None = None, ) -> PdfRestFileBasedResponse: - """Convert a PDF to a specified PDF/X version.""" + """Extract text content from a PDF and return a file-based response.""" - payload: dict[str, Any] = {"files": file, "output_type": output_type} + payload: dict[str, Any] = { + "files": file, + "full_text": full_text, + "preserve_line_breaks": preserve_line_breaks, + "word_style": word_style, + "word_coordinates": word_coordinates, + "output_type": "file", + } + if pages is not None: + payload["pages"] = pages if output is not None: payload["output"] = output return self._post_file_operation( - endpoint="/pdfx", + endpoint="/extracted-text", payload=payload, - payload_model=PdfToPdfxPayload, + payload_model=ExtractTextPayload, extra_query=extra_query, extra_headers=extra_headers, extra_body=extra_body, timeout=timeout, ) - def convert_to_png( + def preview_redactions( self, - files: PdfRestFile | Sequence[PdfRestFile], + file: PdfRestFile | Sequence[PdfRestFile], *, - output_prefix: str | None = None, - page_range: str | Sequence[str] | None = None, - resolution: int = 300, - color_model: Literal["rgb", "rgba", "gray"] = "rgb", - smoothing: Literal["none", "all", "text", "line", "image"] - | Sequence[Literal["none", "all", "text", "line", "image"]] - | None = None, + redactions: PdfRedactionInstruction | Sequence[PdfRedactionInstruction], + output: str | None = None, extra_query: Query | None = None, extra_headers: AnyMapping | None = None, extra_body: Body | None = None, timeout: TimeoutTypes | None = None, ) -> PdfRestFileBasedResponse: - """Convert one or more pdfRest files to PNG images.""" + """Generate a PDF redaction preview with annotated redaction rectangles.""" payload: dict[str, Any] = { - "files": files, - "resolution": resolution, - "color_model": color_model, + "files": file, + "redactions": redactions, } - if output_prefix is not None: - payload["output_prefix"] = output_prefix - if page_range is not None: - payload["page_range"] = page_range - if smoothing is not None: - payload["smoothing"] = smoothing + if output is not None: + payload["output"] = output - return self._convert_to_graphic( - endpoint="/png", + return self._post_file_operation( + endpoint="/pdf-with-redacted-text-preview", payload=payload, - payload_model=PngPdfRestPayload, + payload_model=PdfRedactionPreviewPayload, extra_query=extra_query, extra_headers=extra_headers, extra_body=extra_body, timeout=timeout, ) - def convert_to_bmp( + def apply_redactions( self, - files: PdfRestFile | Sequence[PdfRestFile], + file: PdfRestFile | Sequence[PdfRestFile], *, - output_prefix: str | None = None, - page_range: str | Sequence[str] | None = None, - resolution: int = 300, - color_model: Literal["rgb", "gray"] = "rgb", - smoothing: Literal["none", "all", "text", "line", "image"] - | Sequence[Literal["none", "all", "text", "line", "image"]] - | None = None, + rgb_color: PdfRGBColor | Sequence[int] | None = None, + output: str | None = None, extra_query: Query | None = None, extra_headers: AnyMapping | None = None, extra_body: Body | None = None, timeout: TimeoutTypes | None = None, ) -> PdfRestFileBasedResponse: - """Convert one or more pdfRest files to BMP images.""" + """Apply previously previewed redactions and return the final redacted PDF.""" payload: dict[str, Any] = { - "files": files, - "resolution": resolution, - "color_model": color_model, + "files": file, } - if output_prefix is not None: - payload["output_prefix"] = output_prefix - if page_range is not None: - payload["page_range"] = page_range - if smoothing is not None: - payload["smoothing"] = smoothing + if rgb_color is not None: + payload["rgb_color"] = rgb_color + if output is not None: + payload["output"] = output - return self._convert_to_graphic( - endpoint="/bmp", + return self._post_file_operation( + endpoint="/pdf-with-redacted-text-applied", payload=payload, - payload_model=BmpPdfRestPayload, + payload_model=PdfRedactionApplyPayload, extra_query=extra_query, extra_headers=extra_headers, extra_body=extra_body, timeout=timeout, ) - def convert_to_gif( + def split_pdf( self, - files: PdfRestFile | Sequence[PdfRestFile], + file: PdfRestFile | Sequence[PdfRestFile], *, + page_groups: Sequence[PdfPageSelection] | PdfPageSelection | None = None, output_prefix: str | None = None, - page_range: str | Sequence[str] | None = None, - resolution: int = 300, - color_model: Literal["rgb", "gray"] = "rgb", - smoothing: Literal["none", "all", "text", "line", "image"] - | Sequence[Literal["none", "all", "text", "line", "image"]] - | None = None, extra_query: Query | None = None, extra_headers: AnyMapping | None = None, extra_body: Body | None = None, timeout: TimeoutTypes | None = None, ) -> PdfRestFileBasedResponse: - """Convert one or more pdfRest files to GIF images.""" + """Split a PDF into one or more PDF files based on the provided page groups.""" - payload: dict[str, Any] = { - "files": files, - "resolution": resolution, - "color_model": color_model, - } + payload: dict[str, Any] = {"files": file} + if page_groups is not None: + payload["page_groups"] = page_groups if output_prefix is not None: payload["output_prefix"] = output_prefix - if page_range is not None: - payload["page_range"] = page_range - if smoothing is not None: - payload["smoothing"] = smoothing - return self._convert_to_graphic( - endpoint="/gif", + return self._post_file_operation( + endpoint="/split-pdf", payload=payload, - payload_model=GifPdfRestPayload, + payload_model=PdfSplitPayload, extra_query=extra_query, extra_headers=extra_headers, extra_body=extra_body, timeout=timeout, ) - def convert_to_jpeg( + def merge_pdfs( self, - files: PdfRestFile | Sequence[PdfRestFile], + sources: Sequence[PdfMergeInput], *, output_prefix: str | None = None, - page_range: str | Sequence[str] | None = None, - resolution: int = 300, - color_model: Literal["rgb", "cmyk", "gray"] = "rgb", - smoothing: Literal["none", "all", "text", "line", "image"] - | Sequence[Literal["none", "all", "text", "line", "image"]] - | None = None, - jpeg_quality: int | None = None, extra_query: Query | None = None, extra_headers: AnyMapping | None = None, extra_body: Body | None = None, timeout: TimeoutTypes | None = None, ) -> PdfRestFileBasedResponse: - """Convert one or more pdfRest files to JPEG images.""" + """Merge multiple PDFs (or page subsets) into a single PDF file.""" - payload: dict[str, Any] = { - "files": files, - "resolution": resolution, - "color_model": color_model, - } + payload: dict[str, Any] = {"sources": sources} if output_prefix is not None: payload["output_prefix"] = output_prefix - if page_range is not None: - payload["page_range"] = page_range - if smoothing is not None: - payload["smoothing"] = smoothing - if jpeg_quality is not None: - payload["jpeg_quality"] = jpeg_quality - return self._convert_to_graphic( - endpoint="/jpg", + return self._post_file_operation( + endpoint="/merged-pdf", payload=payload, - payload_model=JpegPdfRestPayload, + payload_model=PdfMergePayload, extra_query=extra_query, extra_headers=extra_headers, extra_body=extra_body, timeout=timeout, ) - def convert_to_tiff( + def convert_to_excel( self, - files: PdfRestFile | Sequence[PdfRestFile], + file: PdfRestFile | Sequence[PdfRestFile], *, - output_prefix: str | None = None, - page_range: str | Sequence[str] | None = None, - resolution: int = 300, - color_model: Literal["rgb", "rgba", "cmyk", "lab", "gray"] = "rgb", - smoothing: Literal["none", "all", "text", "line", "image"] - | Sequence[Literal["none", "all", "text", "line", "image"]] - | None = None, + output: str | None = None, extra_query: Query | None = None, extra_headers: AnyMapping | None = None, extra_body: Body | None = None, timeout: TimeoutTypes | None = None, ) -> PdfRestFileBasedResponse: - """Convert one or more pdfRest files to TIFF images.""" + """Convert a PDF to an Excel spreadsheet.""" - payload: dict[str, Any] = { - "files": files, - "resolution": resolution, - "color_model": color_model, - } - if output_prefix is not None: - payload["output_prefix"] = output_prefix - if page_range is not None: - payload["page_range"] = page_range - if smoothing is not None: - payload["smoothing"] = smoothing + payload: dict[str, Any] = {"files": file} + if output is not None: + payload["output"] = output - return self._convert_to_graphic( - endpoint="/tif", + return self._post_file_operation( + endpoint="/excel", payload=payload, - payload_model=TiffPdfRestPayload, + payload_model=PdfToExcelPayload, extra_query=extra_query, extra_headers=extra_headers, extra_body=extra_body, timeout=timeout, ) - -class AsyncPdfRestClient(_AsyncApiClient): - """Asynchronous client for interacting with the pdfrest API.""" - - def __init__( + def convert_to_powerpoint( + self, + file: PdfRestFile | Sequence[PdfRestFile], + *, + output: str | None = None, + extra_query: Query | None = None, + extra_headers: AnyMapping | None = None, + extra_body: Body | None = None, + timeout: TimeoutTypes | None = None, + ) -> PdfRestFileBasedResponse: + """Convert a PDF to a PowerPoint presentation.""" + + payload: dict[str, Any] = {"files": file} + if output is not None: + payload["output"] = output + + return self._post_file_operation( + endpoint="/powerpoint", + payload=payload, + payload_model=PdfToPowerpointPayload, + extra_query=extra_query, + extra_headers=extra_headers, + extra_body=extra_body, + timeout=timeout, + ) + + def convert_xfa_to_acroforms( + self, + file: PdfRestFile | Sequence[PdfRestFile], + *, + output: str | None = None, + extra_query: Query | None = None, + extra_headers: AnyMapping | None = None, + extra_body: Body | None = None, + timeout: TimeoutTypes | None = None, + ) -> PdfRestFileBasedResponse: + """Convert an XFA PDF to an AcroForm-enabled PDF.""" + + payload: dict[str, Any] = {"files": file} + if output is not None: + payload["output"] = output + + return self._post_file_operation( + endpoint="/pdf-with-acroforms", + payload=payload, + payload_model=PdfXfaToAcroformsPayload, + extra_query=extra_query, + extra_headers=extra_headers, + extra_body=extra_body, + timeout=timeout, + ) + + def convert_to_word( + self, + file: PdfRestFile | Sequence[PdfRestFile], + *, + output: str | None = None, + extra_query: Query | None = None, + extra_headers: AnyMapping | None = None, + extra_body: Body | None = None, + timeout: TimeoutTypes | None = None, + ) -> PdfRestFileBasedResponse: + """Convert a PDF to a Word document.""" + + payload: dict[str, Any] = {"files": file} + if output is not None: + payload["output"] = output + + return self._post_file_operation( + endpoint="/word", + payload=payload, + payload_model=PdfToWordPayload, + extra_query=extra_query, + extra_headers=extra_headers, + extra_body=extra_body, + timeout=timeout, + ) + + def flatten_pdf_forms( + self, + file: PdfRestFile | Sequence[PdfRestFile], + *, + output: str | None = None, + extra_query: Query | None = None, + extra_headers: AnyMapping | None = None, + extra_body: Body | None = None, + timeout: TimeoutTypes | None = None, + ) -> PdfRestFileBasedResponse: + """Flatten form fields in a PDF so they are no longer editable.""" + + payload: dict[str, Any] = {"files": file} + if output is not None: + payload["output"] = output + + return self._post_file_operation( + endpoint="/flattened-forms-pdf", + payload=payload, + payload_model=PdfFlattenFormsPayload, + extra_query=extra_query, + extra_headers=extra_headers, + extra_body=extra_body, + timeout=timeout, + ) + + def compress_pdf( + self, + file: PdfRestFile | Sequence[PdfRestFile], + *, + compression_level: CompressionLevel, + profile: PdfRestFile | Sequence[PdfRestFile] | None = None, + output: str | None = None, + extra_query: Query | None = None, + extra_headers: AnyMapping | None = None, + extra_body: Body | None = None, + timeout: TimeoutTypes | None = None, + ) -> PdfRestFileBasedResponse: + """Compress a PDF using preset or custom compression profiles.""" + + payload: dict[str, Any] = { + "files": file, + "compression_level": compression_level, + } + if profile is not None: + payload["profile"] = profile + if output is not None: + payload["output"] = output + + return self._post_file_operation( + endpoint="/compressed-pdf", + payload=payload, + payload_model=PdfCompressPayload, + extra_query=extra_query, + extra_headers=extra_headers, + extra_body=extra_body, + timeout=timeout, + ) + + def add_attachment_to_pdf( + self, + file: PdfRestFile | Sequence[PdfRestFile], + *, + attachment: PdfRestFile | Sequence[PdfRestFile], + output: str | None = None, + extra_query: Query | None = None, + extra_headers: AnyMapping | None = None, + extra_body: Body | None = None, + timeout: TimeoutTypes | None = None, + ) -> PdfRestFileBasedResponse: + """Attach an uploaded file to a PDF.""" + + payload: dict[str, Any] = {"files": file, "attachment": attachment} + if output is not None: + payload["output"] = output + + return self._post_file_operation( + endpoint="/pdf-with-added-attachment", + payload=payload, + payload_model=PdfAddAttachmentPayload, + extra_query=extra_query, + extra_headers=extra_headers, + extra_body=extra_body, + timeout=timeout, + ) + + def flatten_transparencies( + self, + file: PdfRestFile | Sequence[PdfRestFile], + *, + output: str | None = None, + quality: FlattenQuality = "medium", + extra_query: Query | None = None, + extra_headers: AnyMapping | None = None, + extra_body: Body | None = None, + timeout: TimeoutTypes | None = None, + ) -> PdfRestFileBasedResponse: + """Flatten transparent objects in a PDF.""" + + payload: dict[str, Any] = {"files": file, "quality": quality} + if output is not None: + payload["output"] = output + + return self._post_file_operation( + endpoint="/flattened-transparencies-pdf", + payload=payload, + payload_model=PdfFlattenTransparenciesPayload, + extra_query=extra_query, + extra_headers=extra_headers, + extra_body=extra_body, + timeout=timeout, + ) + + def linearize_pdf( + self, + file: PdfRestFile | Sequence[PdfRestFile], + *, + output: str | None = None, + extra_query: Query | None = None, + extra_headers: AnyMapping | None = None, + extra_body: Body | None = None, + timeout: TimeoutTypes | None = None, + ) -> PdfRestFileBasedResponse: + """Linearize a PDF for optimized fast web view.""" + + payload: dict[str, Any] = {"files": file} + if output is not None: + payload["output"] = output + + return self._post_file_operation( + endpoint="/linearized-pdf", + payload=payload, + payload_model=PdfLinearizePayload, + extra_query=extra_query, + extra_headers=extra_headers, + extra_body=extra_body, + timeout=timeout, + ) + + def flatten_annotations( + self, + file: PdfRestFile | Sequence[PdfRestFile], + *, + output: str | None = None, + extra_query: Query | None = None, + extra_headers: AnyMapping | None = None, + extra_body: Body | None = None, + timeout: TimeoutTypes | None = None, + ) -> PdfRestFileBasedResponse: + """Flatten annotations into the PDF content.""" + + payload: dict[str, Any] = {"files": file} + if output is not None: + payload["output"] = output + + return self._post_file_operation( + endpoint="/flattened-annotations-pdf", + payload=payload, + payload_model=PdfFlattenAnnotationsPayload, + extra_query=extra_query, + extra_headers=extra_headers, + extra_body=extra_body, + timeout=timeout, + ) + + def rasterize_pdf( + self, + file: PdfRestFile | Sequence[PdfRestFile], + *, + output: str | None = None, + extra_query: Query | None = None, + extra_headers: AnyMapping | None = None, + extra_body: Body | None = None, + timeout: TimeoutTypes | None = None, + ) -> PdfRestFileBasedResponse: + """Rasterize a PDF into a flattened bitmap-based PDF.""" + + payload: dict[str, Any] = {"files": file} + if output is not None: + payload["output"] = output + + return self._post_file_operation( + endpoint="/rasterized-pdf", + payload=payload, + payload_model=PdfRasterizePayload, + extra_query=extra_query, + extra_headers=extra_headers, + extra_body=extra_body, + timeout=timeout, + ) + + def convert_to_pdfa( + self, + file: PdfRestFile | Sequence[PdfRestFile], + *, + output_type: PdfAType, + output: str | None = None, + rasterize_if_errors_encountered: bool = False, + extra_query: Query | None = None, + extra_headers: AnyMapping | None = None, + extra_body: Body | None = None, + timeout: TimeoutTypes | None = None, + ) -> PdfRestFileBasedResponse: + """Convert a PDF to a specified PDF/A version.""" + + payload: dict[str, Any] = { + "files": file, + "output_type": output_type, + "rasterize_if_errors_encountered": rasterize_if_errors_encountered, + } + if output is not None: + payload["output"] = output + return self._post_file_operation( + endpoint="/pdfa", + payload=payload, + payload_model=PdfToPdfaPayload, + extra_query=extra_query, + extra_headers=extra_headers, + extra_body=extra_body, + timeout=timeout, + ) + + def convert_to_pdfx( + self, + file: PdfRestFile | Sequence[PdfRestFile], + *, + output_type: PdfXType, + output: str | None = None, + extra_query: Query | None = None, + extra_headers: AnyMapping | None = None, + extra_body: Body | None = None, + timeout: TimeoutTypes | None = None, + ) -> PdfRestFileBasedResponse: + """Convert a PDF to a specified PDF/X version.""" + + payload: dict[str, Any] = {"files": file, "output_type": output_type} + if output is not None: + payload["output"] = output + + return self._post_file_operation( + endpoint="/pdfx", + payload=payload, + payload_model=PdfToPdfxPayload, + extra_query=extra_query, + extra_headers=extra_headers, + extra_body=extra_body, + timeout=timeout, + ) + + def convert_to_png( + self, + files: PdfRestFile | Sequence[PdfRestFile], + *, + output_prefix: str | None = None, + page_range: str | Sequence[str] | None = None, + resolution: int = 300, + color_model: PngColorModel = "rgb", + smoothing: GraphicSmoothing | Sequence[GraphicSmoothing] | None = None, + extra_query: Query | None = None, + extra_headers: AnyMapping | None = None, + extra_body: Body | None = None, + timeout: TimeoutTypes | None = None, + ) -> PdfRestFileBasedResponse: + """Convert one or more pdfRest files to PNG images.""" + + payload: dict[str, Any] = { + "files": files, + "resolution": resolution, + "color_model": color_model, + } + if output_prefix is not None: + payload["output_prefix"] = output_prefix + if page_range is not None: + payload["page_range"] = page_range + if smoothing is not None: + payload["smoothing"] = smoothing + + return self._convert_to_graphic( + endpoint="/png", + payload=payload, + payload_model=PngPdfRestPayload, + extra_query=extra_query, + extra_headers=extra_headers, + extra_body=extra_body, + timeout=timeout, + ) + + def convert_to_bmp( + self, + files: PdfRestFile | Sequence[PdfRestFile], + *, + output_prefix: str | None = None, + page_range: str | Sequence[str] | None = None, + resolution: int = 300, + color_model: BmpColorModel = "rgb", + smoothing: GraphicSmoothing | Sequence[GraphicSmoothing] | None = None, + extra_query: Query | None = None, + extra_headers: AnyMapping | None = None, + extra_body: Body | None = None, + timeout: TimeoutTypes | None = None, + ) -> PdfRestFileBasedResponse: + """Convert one or more pdfRest files to BMP images.""" + + payload: dict[str, Any] = { + "files": files, + "resolution": resolution, + "color_model": color_model, + } + if output_prefix is not None: + payload["output_prefix"] = output_prefix + if page_range is not None: + payload["page_range"] = page_range + if smoothing is not None: + payload["smoothing"] = smoothing + + return self._convert_to_graphic( + endpoint="/bmp", + payload=payload, + payload_model=BmpPdfRestPayload, + extra_query=extra_query, + extra_headers=extra_headers, + extra_body=extra_body, + timeout=timeout, + ) + + def convert_to_gif( + self, + files: PdfRestFile | Sequence[PdfRestFile], + *, + output_prefix: str | None = None, + page_range: str | Sequence[str] | None = None, + resolution: int = 300, + color_model: GifColorModel = "rgb", + smoothing: GraphicSmoothing | Sequence[GraphicSmoothing] | None = None, + extra_query: Query | None = None, + extra_headers: AnyMapping | None = None, + extra_body: Body | None = None, + timeout: TimeoutTypes | None = None, + ) -> PdfRestFileBasedResponse: + """Convert one or more pdfRest files to GIF images.""" + + payload: dict[str, Any] = { + "files": files, + "resolution": resolution, + "color_model": color_model, + } + if output_prefix is not None: + payload["output_prefix"] = output_prefix + if page_range is not None: + payload["page_range"] = page_range + if smoothing is not None: + payload["smoothing"] = smoothing + + return self._convert_to_graphic( + endpoint="/gif", + payload=payload, + payload_model=GifPdfRestPayload, + extra_query=extra_query, + extra_headers=extra_headers, + extra_body=extra_body, + timeout=timeout, + ) + + def convert_to_jpeg( + self, + files: PdfRestFile | Sequence[PdfRestFile], + *, + output_prefix: str | None = None, + page_range: str | Sequence[str] | None = None, + resolution: int = 300, + color_model: JpegColorModel = "rgb", + smoothing: GraphicSmoothing | Sequence[GraphicSmoothing] | None = None, + jpeg_quality: int = 75, + extra_query: Query | None = None, + extra_headers: AnyMapping | None = None, + extra_body: Body | None = None, + timeout: TimeoutTypes | None = None, + ) -> PdfRestFileBasedResponse: + """Convert one or more pdfRest files to JPEG images.""" + + payload: dict[str, Any] = { + "files": files, + "resolution": resolution, + "color_model": color_model, + "jpeg_quality": jpeg_quality, + } + if output_prefix is not None: + payload["output_prefix"] = output_prefix + if page_range is not None: + payload["page_range"] = page_range + if smoothing is not None: + payload["smoothing"] = smoothing + + return self._convert_to_graphic( + endpoint="/jpg", + payload=payload, + payload_model=JpegPdfRestPayload, + extra_query=extra_query, + extra_headers=extra_headers, + extra_body=extra_body, + timeout=timeout, + ) + + def convert_to_tiff( + self, + files: PdfRestFile | Sequence[PdfRestFile], + *, + output_prefix: str | None = None, + page_range: str | Sequence[str] | None = None, + resolution: int = 300, + color_model: TiffColorModel = "rgb", + smoothing: GraphicSmoothing | Sequence[GraphicSmoothing] | None = None, + extra_query: Query | None = None, + extra_headers: AnyMapping | None = None, + extra_body: Body | None = None, + timeout: TimeoutTypes | None = None, + ) -> PdfRestFileBasedResponse: + """Convert one or more pdfRest files to TIFF images.""" + + payload: dict[str, Any] = { + "files": files, + "resolution": resolution, + "color_model": color_model, + } + if output_prefix is not None: + payload["output_prefix"] = output_prefix + if page_range is not None: + payload["page_range"] = page_range + if smoothing is not None: + payload["smoothing"] = smoothing + + return self._convert_to_graphic( + endpoint="/tif", + payload=payload, + payload_model=TiffPdfRestPayload, + extra_query=extra_query, + extra_headers=extra_headers, + extra_body=extra_body, + timeout=timeout, + ) + + +class AsyncPdfRestClient(_AsyncApiClient): + """Asynchronous client for interacting with the pdfrest API.""" + + def __init__( self, *, api_key: str | None = None, @@ -2558,53 +3122,349 @@ def __init__( api_key=api_key, base_url=base_url, timeout=timeout, - headers=headers, - http_client=http_client, - transport=transport, - concurrency_limit=concurrency_limit, - max_retries=max_retries, + headers=headers, + http_client=http_client, + transport=transport, + concurrency_limit=concurrency_limit, + max_retries=max_retries, + ) + self._files_client = _AsyncFilesClient(self) + + @override + async def __aenter__(self) -> AsyncPdfRestClient: + _ = await super().__aenter__() + return self + + @override + async def __aexit__(self, exc_type: Any, exc: Any, traceback: Any) -> None: + await super().__aexit__(exc_type, exc, traceback) + + @property + def files(self) -> _AsyncFilesClient: + return self._files_client + + async def query_pdf_info( + self, + file: PdfRestFile | Sequence[PdfRestFile], + *, + queries: Sequence[PdfInfoQuery] | PdfInfoQuery = ALL_PDF_INFO_QUERIES, + extra_query: Query | None = None, + extra_headers: AnyMapping | None = None, + extra_body: Body | None = None, + timeout: TimeoutTypes | None = None, + ) -> PdfRestInfoResponse: + """Query pdfRest for metadata describing a PDF document asynchronously.""" + + payload = PdfInfoPayload.model_validate({"file": file, "queries": queries}) + request = self.prepare_request( + "POST", + "/pdf-info", + json_body=payload.model_dump( + mode="json", by_alias=True, exclude_none=True, exclude_defaults=True + ), + extra_query=extra_query, + extra_headers=extra_headers, + extra_body=extra_body, + timeout=timeout, + ) + raw_payload = await self._send_request(request) + return PdfRestInfoResponse.model_validate(raw_payload) + + async def summarize_text( + self, + file: PdfRestFile | Sequence[PdfRestFile], + *, + target_word_count: int = 400, + summary_format: SummaryFormat = "overview", + pages: PdfPageSelection | None = None, + output_format: SummaryOutputFormat = "markdown", + output: str | None = None, + extra_query: Query | None = None, + extra_headers: AnyMapping | None = None, + extra_body: Body | None = None, + timeout: TimeoutTypes | None = None, + ) -> SummarizePdfTextResponse: + """Summarize the textual content of a PDF, Markdown, or text document. + + Always requests JSON output and returns the inline summary response defined in + the pdfRest API reference. + """ + + payload: dict[str, Any] = { + "files": file, + "target_word_count": target_word_count, + "summary_format": summary_format, + "output_format": output_format, + "output_type": "json", + } + if pages is not None: + payload["pages"] = pages + if output is not None: + payload["output"] = output + + validated_payload = SummarizePdfTextPayload.model_validate(payload) + request = self.prepare_request( + "POST", + "/summarized-pdf-text", + json_body=validated_payload.model_dump( + mode="json", by_alias=True, exclude_none=True, exclude_unset=True + ), + extra_query=extra_query, + extra_headers=extra_headers, + extra_body=extra_body, + timeout=timeout, + ) + raw_payload = await self._send_request(request) + return SummarizePdfTextResponse.model_validate(raw_payload) + + async def summarize_text_to_file( + self, + file: PdfRestFile | Sequence[PdfRestFile], + *, + target_word_count: int = 400, + summary_format: SummaryFormat = "overview", + pages: PdfPageSelection | None = None, + output_format: SummaryOutputFormat = "markdown", + output: str | None = None, + extra_query: Query | None = None, + extra_headers: AnyMapping | None = None, + extra_body: Body | None = None, + timeout: TimeoutTypes | None = None, + ) -> PdfRestFileBasedResponse: + """Summarize a document and return the result as a downloadable file.""" + + payload: dict[str, Any] = { + "files": file, + "target_word_count": target_word_count, + "summary_format": summary_format, + "output_format": output_format, + "output_type": "file", + } + if pages is not None: + payload["pages"] = pages + if output is not None: + payload["output"] = output + + return await self._post_file_operation( + endpoint="/summarized-pdf-text", + payload=payload, + payload_model=SummarizePdfTextPayload, + extra_query=extra_query, + extra_headers=extra_headers, + extra_body=extra_body, + timeout=timeout, + ) + + async def convert_to_markdown( + self, + file: PdfRestFile | Sequence[PdfRestFile], + *, + pages: PdfPageSelection | None = None, + page_break_comments: bool = False, + output: str | None = None, + extra_query: Query | None = None, + extra_headers: AnyMapping | None = None, + extra_body: Body | None = None, + timeout: TimeoutTypes | None = None, + ) -> PdfRestFileBasedResponse: + """Convert a PDF to Markdown and return a file-based response.""" + + payload: dict[str, Any] = { + "files": file, + "output_type": "file", + "page_break_comments": page_break_comments, + } + if pages is not None: + payload["pages"] = pages + if output is not None: + payload["output"] = output + + return await self._post_file_operation( + endpoint="/markdown", + payload=payload, + payload_model=ConvertToMarkdownPayload, + extra_query=extra_query, + extra_headers=extra_headers, + extra_body=extra_body, + timeout=timeout, + ) + + async def ocr_pdf( + self, + file: PdfRestFile | Sequence[PdfRestFile], + *, + languages: OcrLanguage | Sequence[OcrLanguage] = "English", + pages: PdfPageSelection | None = None, + output: str | None = None, + extra_query: Query | None = None, + extra_headers: AnyMapping | None = None, + extra_body: Body | None = None, + timeout: TimeoutTypes | None = None, + ) -> PdfRestFileBasedResponse: + """Perform OCR on a PDF to make text searchable and extractable.""" + + payload: dict[str, Any] = {"files": file, "languages": languages} + if pages is not None: + payload["pages"] = pages + if output is not None: + payload["output"] = output + + return await self._post_file_operation( + endpoint="/pdf-with-ocr-text", + payload=payload, + payload_model=OcrPdfPayload, + extra_query=extra_query, + extra_headers=extra_headers, + extra_body=extra_body, + timeout=timeout, + ) + + async def translate_pdf_text( + self, + file: PdfRestFile | Sequence[PdfRestFile], + *, + output_language: str, + pages: PdfPageSelection | None = None, + output_format: TranslateOutputFormat = "markdown", + output: str | None = None, + extra_query: Query | None = None, + extra_headers: AnyMapping | None = None, + extra_body: Body | None = None, + timeout: TimeoutTypes | None = None, + ) -> TranslatePdfTextResponse: + """Translate the textual content of a PDF, Markdown, or text document (JSON).""" + + payload: dict[str, Any] = { + "files": file, + "output_language": output_language, + "output_format": output_format, + "output_type": "json", + } + if pages is not None: + payload["pages"] = pages + if output is not None: + payload["output"] = output + + validated_payload = TranslatePdfTextPayload.model_validate(payload) + request = self.prepare_request( + "POST", + "/translated-pdf-text", + json_body=validated_payload.model_dump( + mode="json", by_alias=True, exclude_none=True, exclude_unset=True + ), + extra_query=extra_query, + extra_headers=extra_headers, + extra_body=extra_body, + timeout=timeout, + ) + raw_payload = await self._send_request(request) + return TranslatePdfTextResponse.model_validate(raw_payload) + + async def translate_pdf_text_to_file( + self, + file: PdfRestFile | Sequence[PdfRestFile], + *, + output_language: str, + pages: PdfPageSelection | None = None, + output_format: TranslateOutputFormat = "markdown", + output: str | None = None, + extra_query: Query | None = None, + extra_headers: AnyMapping | None = None, + extra_body: Body | None = None, + timeout: TimeoutTypes | None = None, + ) -> TranslatePdfTextFileResponse: + """Translate textual content and receive a file-based response.""" + + payload: dict[str, Any] = { + "files": file, + "output_language": output_language, + "output_format": output_format, + "output_type": "file", + } + if pages is not None: + payload["pages"] = pages + if output is not None: + payload["output"] = output + + return await self._post_file_operation( + endpoint="/translated-pdf-text", + payload=payload, + payload_model=TranslatePdfTextPayload, + extra_query=extra_query, + extra_headers=extra_headers, + extra_body=extra_body, + timeout=timeout, + response_model=TranslatePdfTextFileResponse, ) - self._files_client = _AsyncFilesClient(self) - @override - async def __aenter__(self) -> AsyncPdfRestClient: - _ = await super().__aenter__() - return self + async def extract_images( + self, + file: PdfRestFile | Sequence[PdfRestFile], + *, + pages: PdfPageSelection | None = None, + output: str | None = None, + extra_query: Query | None = None, + extra_headers: AnyMapping | None = None, + extra_body: Body | None = None, + timeout: TimeoutTypes | None = None, + ) -> PdfRestFileBasedResponse: + """Extract embedded images from a PDF.""" - @override - async def __aexit__(self, exc_type: Any, exc: Any, traceback: Any) -> None: - await super().__aexit__(exc_type, exc, traceback) + payload: dict[str, Any] = {"files": file} + if pages is not None: + payload["pages"] = pages + if output is not None: + payload["output"] = output - @property - def files(self) -> _AsyncFilesClient: - return self._files_client + return await self._post_file_operation( + endpoint="/extracted-images", + payload=payload, + payload_model=ExtractImagesPayload, + extra_query=extra_query, + extra_headers=extra_headers, + extra_body=extra_body, + timeout=timeout, + ) - async def query_pdf_info( + async def extract_pdf_text_to_file( self, file: PdfRestFile | Sequence[PdfRestFile], *, - queries: Sequence[PdfInfoQuery] | PdfInfoQuery = ALL_PDF_INFO_QUERIES, + pages: PdfPageSelection | None = None, + full_text: ExtractTextGranularity = "document", + preserve_line_breaks: bool = False, + word_style: bool = False, + word_coordinates: bool = False, + output: str | None = None, extra_query: Query | None = None, extra_headers: AnyMapping | None = None, extra_body: Body | None = None, timeout: TimeoutTypes | None = None, - ) -> PdfRestInfoResponse: - """Query pdfRest for metadata describing a PDF document asynchronously.""" + ) -> PdfRestFileBasedResponse: + """Extract text content from a PDF and return a file-based response.""" - payload = PdfInfoPayload.model_validate({"file": file, "queries": queries}) - request = self.prepare_request( - "POST", - "/pdf-info", - json_body=payload.model_dump( - mode="json", by_alias=True, exclude_none=True, exclude_defaults=True - ), + payload: dict[str, Any] = { + "files": file, + "full_text": full_text, + "preserve_line_breaks": preserve_line_breaks, + "word_style": word_style, + "word_coordinates": word_coordinates, + "output_type": "file", + } + if pages is not None: + payload["pages"] = pages + if output is not None: + payload["output"] = output + + return await self._post_file_operation( + endpoint="/extracted-text", + payload=payload, + payload_model=ExtractTextPayload, extra_query=extra_query, extra_headers=extra_headers, extra_body=extra_body, timeout=timeout, ) - raw_payload = await self._send_request(request) - return PdfRestInfoResponse.model_validate(raw_payload) async def preview_redactions( self, @@ -2764,6 +3624,84 @@ async def merge_pdfs( timeout=timeout, ) + async def convert_to_excel( + self, + file: PdfRestFile | Sequence[PdfRestFile], + *, + output: str | None = None, + extra_query: Query | None = None, + extra_headers: AnyMapping | None = None, + extra_body: Body | None = None, + timeout: TimeoutTypes | None = None, + ) -> PdfRestFileBasedResponse: + """Asynchronously convert a PDF to an Excel spreadsheet.""" + + payload: dict[str, Any] = {"files": file} + if output is not None: + payload["output"] = output + + return await self._post_file_operation( + endpoint="/excel", + payload=payload, + payload_model=PdfToExcelPayload, + extra_query=extra_query, + extra_headers=extra_headers, + extra_body=extra_body, + timeout=timeout, + ) + + async def convert_to_powerpoint( + self, + file: PdfRestFile | Sequence[PdfRestFile], + *, + output: str | None = None, + extra_query: Query | None = None, + extra_headers: AnyMapping | None = None, + extra_body: Body | None = None, + timeout: TimeoutTypes | None = None, + ) -> PdfRestFileBasedResponse: + """Asynchronously convert a PDF to a PowerPoint presentation.""" + + payload: dict[str, Any] = {"files": file} + if output is not None: + payload["output"] = output + + return await self._post_file_operation( + endpoint="/powerpoint", + payload=payload, + payload_model=PdfToPowerpointPayload, + extra_query=extra_query, + extra_headers=extra_headers, + extra_body=extra_body, + timeout=timeout, + ) + + async def convert_xfa_to_acroforms( + self, + file: PdfRestFile | Sequence[PdfRestFile], + *, + output: str | None = None, + extra_query: Query | None = None, + extra_headers: AnyMapping | None = None, + extra_body: Body | None = None, + timeout: TimeoutTypes | None = None, + ) -> PdfRestFileBasedResponse: + """Asynchronously convert an XFA PDF to an AcroForm-enabled PDF.""" + + payload: dict[str, Any] = {"files": file} + if output is not None: + payload["output"] = output + + return await self._post_file_operation( + endpoint="/pdf-with-acroforms", + payload=payload, + payload_model=PdfXfaToAcroformsPayload, + extra_query=extra_query, + extra_headers=extra_headers, + extra_body=extra_body, + timeout=timeout, + ) + async def convert_to_word( self, file: PdfRestFile | Sequence[PdfRestFile], @@ -2820,7 +3758,7 @@ async def compress_pdf( self, file: PdfRestFile | Sequence[PdfRestFile], *, - compression_level: Literal["low", "medium", "high", "custom"], + compression_level: CompressionLevel, profile: PdfRestFile | Sequence[PdfRestFile] | None = None, output: str | None = None, extra_query: Query | None = None, @@ -2849,6 +3787,170 @@ async def compress_pdf( timeout=timeout, ) + async def add_attachment_to_pdf( + self, + file: PdfRestFile | Sequence[PdfRestFile], + *, + attachment: PdfRestFile | Sequence[PdfRestFile], + output: str | None = None, + extra_query: Query | None = None, + extra_headers: AnyMapping | None = None, + extra_body: Body | None = None, + timeout: TimeoutTypes | None = None, + ) -> PdfRestFileBasedResponse: + """Asynchronously attach an uploaded file to a PDF.""" + + payload: dict[str, Any] = {"files": file, "attachment": attachment} + if output is not None: + payload["output"] = output + + return await self._post_file_operation( + endpoint="/pdf-with-added-attachment", + payload=payload, + payload_model=PdfAddAttachmentPayload, + extra_query=extra_query, + extra_headers=extra_headers, + extra_body=extra_body, + timeout=timeout, + ) + + async def flatten_transparencies( + self, + file: PdfRestFile | Sequence[PdfRestFile], + *, + output: str | None = None, + quality: FlattenQuality = "medium", + extra_query: Query | None = None, + extra_headers: AnyMapping | None = None, + extra_body: Body | None = None, + timeout: TimeoutTypes | None = None, + ) -> PdfRestFileBasedResponse: + """Asynchronously flatten transparent objects in a PDF.""" + + payload: dict[str, Any] = {"files": file, "quality": quality} + if output is not None: + payload["output"] = output + + return await self._post_file_operation( + endpoint="/flattened-transparencies-pdf", + payload=payload, + payload_model=PdfFlattenTransparenciesPayload, + extra_query=extra_query, + extra_headers=extra_headers, + extra_body=extra_body, + timeout=timeout, + ) + + async def linearize_pdf( + self, + file: PdfRestFile | Sequence[PdfRestFile], + *, + output: str | None = None, + extra_query: Query | None = None, + extra_headers: AnyMapping | None = None, + extra_body: Body | None = None, + timeout: TimeoutTypes | None = None, + ) -> PdfRestFileBasedResponse: + """Asynchronously linearize a PDF for optimized fast web view.""" + + payload: dict[str, Any] = {"files": file} + if output is not None: + payload["output"] = output + + return await self._post_file_operation( + endpoint="/linearized-pdf", + payload=payload, + payload_model=PdfLinearizePayload, + extra_query=extra_query, + extra_headers=extra_headers, + extra_body=extra_body, + timeout=timeout, + ) + + async def flatten_annotations( + self, + file: PdfRestFile | Sequence[PdfRestFile], + *, + output: str | None = None, + extra_query: Query | None = None, + extra_headers: AnyMapping | None = None, + extra_body: Body | None = None, + timeout: TimeoutTypes | None = None, + ) -> PdfRestFileBasedResponse: + """Asynchronously flatten annotations into the PDF content.""" + + payload: dict[str, Any] = {"files": file} + if output is not None: + payload["output"] = output + + return await self._post_file_operation( + endpoint="/flattened-annotations-pdf", + payload=payload, + payload_model=PdfFlattenAnnotationsPayload, + extra_query=extra_query, + extra_headers=extra_headers, + extra_body=extra_body, + timeout=timeout, + ) + + async def rasterize_pdf( + self, + file: PdfRestFile | Sequence[PdfRestFile], + *, + output: str | None = None, + extra_query: Query | None = None, + extra_headers: AnyMapping | None = None, + extra_body: Body | None = None, + timeout: TimeoutTypes | None = None, + ) -> PdfRestFileBasedResponse: + """Asynchronously rasterize a PDF into a flattened bitmap-based PDF.""" + + payload: dict[str, Any] = {"files": file} + if output is not None: + payload["output"] = output + + return await self._post_file_operation( + endpoint="/rasterized-pdf", + payload=payload, + payload_model=PdfRasterizePayload, + extra_query=extra_query, + extra_headers=extra_headers, + extra_body=extra_body, + timeout=timeout, + ) + + async def convert_to_pdfa( + self, + file: PdfRestFile | Sequence[PdfRestFile], + *, + output_type: PdfAType, + output: str | None = None, + rasterize_if_errors_encountered: bool = False, + extra_query: Query | None = None, + extra_headers: AnyMapping | None = None, + extra_body: Body | None = None, + timeout: TimeoutTypes | None = None, + ) -> PdfRestFileBasedResponse: + """Asynchronously convert a PDF to a specified PDF/A version.""" + + payload: dict[str, Any] = { + "files": file, + "output_type": output_type, + "rasterize_if_errors_encountered": rasterize_if_errors_encountered, + } + if output is not None: + payload["output"] = output + + return await self._post_file_operation( + endpoint="/pdfa", + payload=payload, + payload_model=PdfToPdfaPayload, + extra_query=extra_query, + extra_headers=extra_headers, + extra_body=extra_body, + timeout=timeout, + ) + async def convert_to_pdfx( self, file: PdfRestFile | Sequence[PdfRestFile], @@ -2883,10 +3985,8 @@ async def convert_to_png( output_prefix: str | None = None, page_range: str | Sequence[str] | None = None, resolution: int = 300, - color_model: Literal["rgb", "rgba", "gray"] = "rgb", - smoothing: Literal["none", "all", "text", "line", "image"] - | Sequence[Literal["none", "all", "text", "line", "image"]] - | None = None, + color_model: PngColorModel = "rgb", + smoothing: GraphicSmoothing | Sequence[GraphicSmoothing] | None = None, extra_query: Query | None = None, extra_headers: AnyMapping | None = None, extra_body: Body | None = None, @@ -2923,10 +4023,8 @@ async def convert_to_bmp( output_prefix: str | None = None, page_range: str | Sequence[str] | None = None, resolution: int = 300, - color_model: Literal["rgb", "gray"] = "rgb", - smoothing: Literal["none", "all", "text", "line", "image"] - | Sequence[Literal["none", "all", "text", "line", "image"]] - | None = None, + color_model: BmpColorModel = "rgb", + smoothing: GraphicSmoothing | Sequence[GraphicSmoothing] | None = None, extra_query: Query | None = None, extra_headers: AnyMapping | None = None, extra_body: Body | None = None, @@ -2963,10 +4061,8 @@ async def convert_to_gif( output_prefix: str | None = None, page_range: str | Sequence[str] | None = None, resolution: int = 300, - color_model: Literal["rgb", "gray"] = "rgb", - smoothing: Literal["none", "all", "text", "line", "image"] - | Sequence[Literal["none", "all", "text", "line", "image"]] - | None = None, + color_model: GifColorModel = "rgb", + smoothing: GraphicSmoothing | Sequence[GraphicSmoothing] | None = None, extra_query: Query | None = None, extra_headers: AnyMapping | None = None, extra_body: Body | None = None, @@ -3003,11 +4099,9 @@ async def convert_to_jpeg( output_prefix: str | None = None, page_range: str | Sequence[str] | None = None, resolution: int = 300, - color_model: Literal["rgb", "cmyk", "gray"] = "rgb", - smoothing: Literal["none", "all", "text", "line", "image"] - | Sequence[Literal["none", "all", "text", "line", "image"]] - | None = None, - jpeg_quality: int | None = None, + color_model: JpegColorModel = "rgb", + smoothing: GraphicSmoothing | Sequence[GraphicSmoothing] | None = None, + jpeg_quality: int = 75, extra_query: Query | None = None, extra_headers: AnyMapping | None = None, extra_body: Body | None = None, @@ -3019,6 +4113,7 @@ async def convert_to_jpeg( "files": files, "resolution": resolution, "color_model": color_model, + "jpeg_quality": jpeg_quality, } if output_prefix is not None: payload["output_prefix"] = output_prefix @@ -3026,8 +4121,6 @@ async def convert_to_jpeg( payload["page_range"] = page_range if smoothing is not None: payload["smoothing"] = smoothing - if jpeg_quality is not None: - payload["jpeg_quality"] = jpeg_quality return await self._convert_to_graphic( endpoint="/jpg", @@ -3046,10 +4139,8 @@ async def convert_to_tiff( output_prefix: str | None = None, page_range: str | Sequence[str] | None = None, resolution: int = 300, - color_model: Literal["rgb", "rgba", "cmyk", "lab", "gray"] = "rgb", - smoothing: Literal["none", "all", "text", "line", "image"] - | Sequence[Literal["none", "all", "text", "line", "image"]] - | None = None, + color_model: TiffColorModel = "rgb", + smoothing: GraphicSmoothing | Sequence[GraphicSmoothing] | None = None, extra_query: Query | None = None, extra_headers: AnyMapping | None = None, extra_body: Body | None = None, diff --git a/src/pdfrest/models/__init__.py b/src/pdfrest/models/__init__.py index 54c9aeb4..ef10e565 100644 --- a/src/pdfrest/models/__init__.py +++ b/src/pdfrest/models/__init__.py @@ -5,6 +5,9 @@ PdfRestFileBasedResponse, PdfRestFileID, PdfRestInfoResponse, + SummarizePdfTextResponse, + TranslatePdfTextFileResponse, + TranslatePdfTextResponse, UpResponse, ) @@ -15,5 +18,8 @@ "PdfRestFileBasedResponse", "PdfRestFileID", "PdfRestInfoResponse", + "SummarizePdfTextResponse", + "TranslatePdfTextFileResponse", + "TranslatePdfTextResponse", "UpResponse", ] diff --git a/src/pdfrest/models/_internal.py b/src/pdfrest/models/_internal.py index 33cb8747..25d0a3fc 100644 --- a/src/pdfrest/models/_internal.py +++ b/src/pdfrest/models/_internal.py @@ -6,6 +6,7 @@ from pathlib import PurePath from typing import Annotated, Any, Generic, Literal, TypeVar +from langcodes import tag_is_valid from pydantic import ( AfterValidator, AliasChoices, @@ -21,7 +22,16 @@ from pdfrest.types.public import PdfRedactionPreset -from ..types import PdfInfoQuery, PdfXType +from ..types import ( + OcrLanguage, + PdfAType, + PdfInfoQuery, + PdfXType, + SummaryFormat, + SummaryOutputFormat, + SummaryOutputType, + TranslateOutputFormat, +) from . import PdfRestFile from .public import PdfRestFileID @@ -112,6 +122,12 @@ def _serialize_file_ids(value: list[PdfRestFile]) -> str: return ",".join(str(file.id) for file in value) +def _bool_to_on_off(value: Any) -> Any: + if isinstance(value, bool): + return "on" if value else "off" + return value + + def _serialize_page_ranges(value: list[str | int | tuple[str | int, ...]]) -> str: def join_tuple(value: str | int | tuple[str | int, ...]) -> str: if isinstance(value, tuple): @@ -160,6 +176,45 @@ def _int_to_string(value: Any) -> Any: return value +_OUTPUT_LANGUAGE_ERROR = ( + "The provided 'output_language' language tag is invalid. Format 'output_language' as " + "a valid 2-3 character ISO 639 language code (e.g., 'en', 'es', 'fra'), optionally " + "with a script, alphabetic region, or numeric region (e.g., 'zh-Hant', 'eng-US', " + "'es-419'). See documentation for recommended formats." +) + + +def _validate_output_language(value: str) -> str: + if not value: + raise ValueError(_OUTPUT_LANGUAGE_ERROR) + + trimmed = value.strip() + if not trimmed: + raise ValueError(_OUTPUT_LANGUAGE_ERROR) + + segments = trimmed.split("-") + if len(segments) > 2: + raise ValueError(_OUTPUT_LANGUAGE_ERROR) + + language = segments[0] + if not re.fullmatch(r"[A-Za-z]{2,3}", language): + raise ValueError(_OUTPUT_LANGUAGE_ERROR) + + if len(segments) == 2: + subtag = segments[1] + if not ( + re.fullmatch(r"[A-Za-z]{4}", subtag) + or re.fullmatch(r"[A-Za-z]{2}", subtag) + or re.fullmatch(r"[0-9]{3}", subtag) + ): + raise ValueError(_OUTPUT_LANGUAGE_ERROR) + + if not tag_is_valid(trimmed): + raise ValueError(_OUTPUT_LANGUAGE_ERROR) + + return trimmed + + class UploadURLs(BaseModel): url: Annotated[ list[HttpUrl] | HttpUrl, @@ -248,6 +303,265 @@ class PdfInfoPayload(BaseModel): ] +class SummarizePdfTextPayload(BaseModel): + """Adapt caller options into a pdfRest-ready summarize request payload.""" + + files: Annotated[ + list[PdfRestFile], + Field( + min_length=1, + max_length=1, + validation_alias=AliasChoices("file", "files"), + serialization_alias="id", + ), + BeforeValidator(_ensure_list), + AfterValidator( + _allowed_mime_types( + "application/pdf", + "text/markdown", + "text/plain", + error_msg="Must be a PDF, Markdown, or plain text file", + ) + ), + PlainSerializer(_serialize_as_first_file_id), + ] + target_word_count: Annotated[ + int | None, Field(serialization_alias="target_word_count", ge=1, default=400) + ] = 400 + summary_format: Annotated[ + SummaryFormat, Field(serialization_alias="summary_format", default="overview") + ] = "overview" + pages: Annotated[ + list[AscendingPageRange] | None, + Field(serialization_alias="pages", min_length=1, default=None), + BeforeValidator(_ensure_list), + BeforeValidator(_split_comma_list), + BeforeValidator(_int_to_string), + PlainSerializer(_serialize_page_ranges), + ] = None + output_format: Annotated[ + SummaryOutputFormat, + Field(serialization_alias="output_format", default="markdown"), + ] = "markdown" + output_type: Annotated[ + SummaryOutputType, Field(serialization_alias="output_type", default="json") + ] = "json" + output: Annotated[ + str | None, + Field(serialization_alias="output", min_length=1, default=None), + AfterValidator(_validate_output_prefix), + ] = None + + +class OcrPdfPayload(BaseModel): + """Adapt caller options into a pdfRest-ready OCR request payload.""" + + files: Annotated[ + list[PdfRestFile], + Field( + min_length=1, + max_length=1, + validation_alias=AliasChoices("file", "files"), + serialization_alias="id", + ), + BeforeValidator(_ensure_list), + AfterValidator( + _allowed_mime_types("application/pdf", error_msg="Must be a PDF file") + ), + PlainSerializer(_serialize_as_first_file_id), + ] + languages: Annotated[ + list[OcrLanguage], + Field( + serialization_alias="languages", + validation_alias=AliasChoices("languages", "language"), + min_length=1, + default_factory=lambda: ["English"], + ), + BeforeValidator(_ensure_list), + BeforeValidator(_split_comma_list), + PlainSerializer(_serialize_as_comma_separated_string), + ] + pages: Annotated[ + list[AscendingPageRange] | None, + Field(serialization_alias="pages", min_length=1, default=None), + BeforeValidator(_ensure_list), + BeforeValidator(_split_comma_list), + BeforeValidator(_int_to_string), + PlainSerializer(_serialize_page_ranges), + ] = None + output: Annotated[ + str | None, + Field(serialization_alias="output", min_length=1, default=None), + AfterValidator(_validate_output_prefix), + ] = None + + +class ExtractTextPayload(BaseModel): + """Adapt caller options into a pdfRest-ready extract text request payload.""" + + files: Annotated[ + list[PdfRestFile], + Field( + min_length=1, + max_length=1, + validation_alias=AliasChoices("file", "files"), + serialization_alias="id", + ), + BeforeValidator(_ensure_list), + AfterValidator( + _allowed_mime_types("application/pdf", error_msg="Must be a PDF file") + ), + PlainSerializer(_serialize_as_first_file_id), + ] + pages: Annotated[ + list[AscendingPageRange] | None, + Field(serialization_alias="pages", min_length=1, default=None), + BeforeValidator(_ensure_list), + BeforeValidator(_split_comma_list), + BeforeValidator(_int_to_string), + PlainSerializer(_serialize_page_ranges), + ] = None + full_text: Literal["off", "by_page", "document"] = "document" + preserve_line_breaks: Annotated[ + Literal["off", "on"], BeforeValidator(_bool_to_on_off) + ] = "off" + word_style: Annotated[Literal["off", "on"], BeforeValidator(_bool_to_on_off)] = ( + "off" + ) + word_coordinates: Annotated[ + Literal["off", "on"], BeforeValidator(_bool_to_on_off) + ] = "off" + output_type: Literal["json", "file"] = "json" + output: Annotated[ + str | None, + Field(serialization_alias="output", min_length=1, default=None), + AfterValidator(_validate_output_prefix), + ] = None + + +class ConvertToMarkdownPayload(BaseModel): + """Adapt caller options into a pdfRest-ready markdown conversion payload.""" + + files: Annotated[ + list[PdfRestFile], + Field( + min_length=1, + max_length=1, + validation_alias=AliasChoices("file", "files"), + serialization_alias="id", + ), + BeforeValidator(_ensure_list), + AfterValidator( + _allowed_mime_types("application/pdf", error_msg="Must be a PDF file") + ), + PlainSerializer(_serialize_as_first_file_id), + ] + pages: Annotated[ + list[AscendingPageRange] | None, + Field(serialization_alias="pages", min_length=1, default=None), + BeforeValidator(_ensure_list), + BeforeValidator(_split_comma_list), + BeforeValidator(_int_to_string), + PlainSerializer(_serialize_page_ranges), + ] = None + output_type: Annotated[ + SummaryOutputType, Field(serialization_alias="output_type", default="json") + ] = "json" + page_break_comments: Annotated[ + Literal["on", "off"] | None, + Field(serialization_alias="page_break_comments", default=None), + BeforeValidator(_bool_to_on_off), + ] = None + output: Annotated[ + str | None, + Field(serialization_alias="output", min_length=1, default=None), + AfterValidator(_validate_output_prefix), + ] = None + + +class TranslatePdfTextPayload(BaseModel): + """Adapt caller options into a pdfRest-ready translate request payload.""" + + files: Annotated[ + list[PdfRestFile], + Field( + min_length=1, + max_length=1, + validation_alias=AliasChoices("file", "files"), + serialization_alias="id", + ), + BeforeValidator(_ensure_list), + AfterValidator( + _allowed_mime_types( + "application/pdf", + "text/markdown", + "text/plain", + error_msg="Must be a PDF, Markdown, or plain text file", + ) + ), + PlainSerializer(_serialize_as_first_file_id), + ] + output_language: Annotated[ + str, + Field(serialization_alias="output_language"), + AfterValidator(_validate_output_language), + ] + pages: Annotated[ + list[AscendingPageRange] | None, + Field(serialization_alias="pages", min_length=1, default=None), + BeforeValidator(_ensure_list), + BeforeValidator(_split_comma_list), + BeforeValidator(_int_to_string), + PlainSerializer(_serialize_page_ranges), + ] = None + output_format: Annotated[ + TranslateOutputFormat, + Field(serialization_alias="output_format", default="markdown"), + ] = "markdown" + output_type: Annotated[ + Literal["json", "file"], + Field(serialization_alias="output_type", default="json"), + ] = "json" + output: Annotated[ + str | None, + Field(serialization_alias="output", min_length=1, default=None), + AfterValidator(_validate_output_prefix), + ] = None + + +class ExtractImagesPayload(BaseModel): + """Adapt caller options into a pdfRest-ready extract images request payload.""" + + files: Annotated[ + list[PdfRestFile], + Field( + min_length=1, + max_length=1, + validation_alias=AliasChoices("file", "files"), + serialization_alias="id", + ), + BeforeValidator(_ensure_list), + AfterValidator( + _allowed_mime_types("application/pdf", error_msg="Must be a PDF file") + ), + PlainSerializer(_serialize_as_first_file_id), + ] + pages: Annotated[ + list[AscendingPageRange] | None, + Field(serialization_alias="pages", min_length=1, default=None), + BeforeValidator(_ensure_list), + BeforeValidator(_split_comma_list), + BeforeValidator(_int_to_string), + PlainSerializer(_serialize_page_ranges), + ] = None + output: Annotated[ + str | None, + Field(serialization_alias="output", min_length=1, default=None), + AfterValidator(_validate_output_prefix), + ] = None + + RgbChannel = Annotated[int, Field(ge=0, le=255)] @@ -519,6 +833,87 @@ class PdfToWordPayload(BaseModel): ] = None +class PdfToExcelPayload(BaseModel): + """Adapt caller options into a pdfRest-ready Excel request payload.""" + + files: Annotated[ + list[PdfRestFile], + Field( + min_length=1, + max_length=1, + validation_alias=AliasChoices("file", "files"), + serialization_alias="id", + ), + BeforeValidator(_ensure_list), + AfterValidator( + _allowed_mime_types("application/pdf", error_msg="Must be a PDF file") + ), + PlainSerializer(_serialize_as_first_file_id), + ] + output: Annotated[ + str | None, + Field(serialization_alias="output", min_length=1, default=None), + AfterValidator(_validate_output_prefix), + ] = None + + +class PdfToPowerpointPayload(BaseModel): + """Adapt caller options into a pdfRest-ready PowerPoint request payload.""" + + files: Annotated[ + list[PdfRestFile], + Field( + min_length=1, + max_length=1, + validation_alias=AliasChoices("file", "files"), + serialization_alias="id", + ), + BeforeValidator(_ensure_list), + AfterValidator( + _allowed_mime_types("application/pdf", error_msg="Must be a PDF file") + ), + PlainSerializer(_serialize_as_first_file_id), + ] + output: Annotated[ + str | None, + Field(serialization_alias="output", min_length=1, default=None), + AfterValidator(_validate_output_prefix), + ] = None + + +class PdfToPdfaPayload(BaseModel): + """Adapt caller options into a pdfRest-ready PDF/A request payload.""" + + files: Annotated[ + list[PdfRestFile], + Field( + min_length=1, + max_length=1, + validation_alias=AliasChoices("file", "files"), + serialization_alias="id", + ), + BeforeValidator(_ensure_list), + AfterValidator( + _allowed_mime_types("application/pdf", error_msg="Must be a PDF file") + ), + PlainSerializer(_serialize_as_first_file_id), + ] + output_type: Annotated[PdfAType, Field(serialization_alias="output_type")] + output: Annotated[ + str | None, + Field(serialization_alias="output", min_length=1, default=None), + AfterValidator(_validate_output_prefix), + ] = None + rasterize_if_errors_encountered: Annotated[ + Literal["on", "off"] | None, + Field( + serialization_alias="rasterize_if_errors_encountered", + default=None, + ), + BeforeValidator(_bool_to_on_off), + ] = None + + class PdfToPdfxPayload(BaseModel): """Adapt caller options into a pdfRest-ready PDF/X request payload.""" @@ -626,6 +1021,167 @@ def _validate_profile_dependency(self) -> PdfCompressPayload: return self +class PdfXfaToAcroformsPayload(BaseModel): + """Adapt caller options into a pdfRest-ready XFA-to-AcroForms request payload.""" + + files: Annotated[ + list[PdfRestFile], + Field( + min_length=1, + max_length=1, + validation_alias=AliasChoices("file", "files"), + serialization_alias="id", + ), + BeforeValidator(_ensure_list), + AfterValidator( + _allowed_mime_types("application/pdf", error_msg="Must be a PDF file") + ), + PlainSerializer(_serialize_as_first_file_id), + ] + output: Annotated[ + str | None, + Field(serialization_alias="output", min_length=1, default=None), + AfterValidator(_validate_output_prefix), + ] = None + + +class PdfLinearizePayload(BaseModel): + """Adapt caller options into a pdfRest-ready linearize PDF request payload.""" + + files: Annotated[ + list[PdfRestFile], + Field( + min_length=1, + max_length=1, + validation_alias=AliasChoices("file", "files"), + serialization_alias="id", + ), + BeforeValidator(_ensure_list), + AfterValidator( + _allowed_mime_types("application/pdf", error_msg="Must be a PDF file") + ), + PlainSerializer(_serialize_as_first_file_id), + ] + output: Annotated[ + str | None, + Field(serialization_alias="output", min_length=1, default=None), + AfterValidator(_validate_output_prefix), + ] = None + + +class PdfRasterizePayload(BaseModel): + """Adapt caller options into a pdfRest-ready rasterize PDF request payload.""" + + files: Annotated[ + list[PdfRestFile], + Field( + min_length=1, + max_length=1, + validation_alias=AliasChoices("file", "files"), + serialization_alias="id", + ), + BeforeValidator(_ensure_list), + AfterValidator( + _allowed_mime_types("application/pdf", error_msg="Must be a PDF file") + ), + PlainSerializer(_serialize_as_first_file_id), + ] + output: Annotated[ + str | None, + Field(serialization_alias="output", min_length=1, default=None), + AfterValidator(_validate_output_prefix), + ] = None + + +class PdfFlattenTransparenciesPayload(BaseModel): + """Adapt caller options into a pdfRest-ready flatten-transparencies request payload.""" + + files: Annotated[ + list[PdfRestFile], + Field( + min_length=1, + max_length=1, + validation_alias=AliasChoices("file", "files"), + serialization_alias="id", + ), + BeforeValidator(_ensure_list), + AfterValidator( + _allowed_mime_types("application/pdf", error_msg="Must be a PDF file") + ), + PlainSerializer(_serialize_as_first_file_id), + ] + output: Annotated[ + str | None, + Field(serialization_alias="output", min_length=1, default=None), + AfterValidator(_validate_output_prefix), + ] = None + quality: Literal["low", "medium", "high"] = "medium" + + +class PdfFlattenAnnotationsPayload(BaseModel): + """Adapt caller options into a pdfRest-ready flatten-annotations request payload.""" + + files: Annotated[ + list[PdfRestFile], + Field( + min_length=1, + max_length=1, + validation_alias=AliasChoices("file", "files"), + serialization_alias="id", + ), + BeforeValidator(_ensure_list), + AfterValidator( + _allowed_mime_types("application/pdf", error_msg="Must be a PDF file") + ), + PlainSerializer(_serialize_as_first_file_id), + ] + output: Annotated[ + str | None, + Field(serialization_alias="output", min_length=1, default=None), + AfterValidator(_validate_output_prefix), + ] = None + + +class PdfAddAttachmentPayload(BaseModel): + """Adapt caller options into a pdfRest-ready add-attachment request payload.""" + + files: Annotated[ + list[PdfRestFile], + Field( + min_length=1, + max_length=1, + validation_alias=AliasChoices("file", "files"), + serialization_alias="id", + ), + BeforeValidator(_ensure_list), + AfterValidator( + _allowed_mime_types("application/pdf", error_msg="Must be a PDF file") + ), + PlainSerializer(_serialize_as_first_file_id), + ] + attachments: Annotated[ + list[PdfRestFile], + Field( + min_length=1, + max_length=1, + validation_alias=AliasChoices( + "attachment", + "attachments", + "file_to_attach", + "files_to_attach", + ), + serialization_alias="id_to_attach", + ), + BeforeValidator(_ensure_list), + PlainSerializer(_serialize_as_first_file_id), + ] + output: Annotated[ + str | None, + Field(serialization_alias="output", min_length=1, default=None), + AfterValidator(_validate_output_prefix), + ] = None + + class BmpPdfRestPayload(BasePdfRestGraphicPayload[Literal["rgb", "gray"]]): """Adapt caller options into a pdfRest-ready BMP request payload.""" diff --git a/src/pdfrest/models/public.py b/src/pdfrest/models/public.py index 3de11476..e4dc8a3a 100644 --- a/src/pdfrest/models/public.py +++ b/src/pdfrest/models/public.py @@ -26,6 +26,9 @@ "PdfRestFileBasedResponse", "PdfRestFileID", "PdfRestInfoResponse", + "SummarizePdfTextResponse", + "TranslatePdfTextFileResponse", + "TranslatePdfTextResponse", "UpResponse", ) @@ -312,6 +315,93 @@ class PdfRestDeletionResponse(BaseModel): ] +class SummarizePdfTextResponse(BaseModel): + """Response returned by the summarize-pdf-text tool.""" + + model_config = ConfigDict(extra="allow") + + summary: Annotated[ + str | None, + Field( + description="Summary content", + default=None, + ), + ] = None + input_id: Annotated[ + PdfRestFileID, + Field( + validation_alias=AliasChoices("input_id", "inputId"), + description="The id of the input file.", + ), + ] + + +class TranslatePdfTextResponse(BaseModel): + """Response returned by the translated-pdf-text tool.""" + + model_config = ConfigDict(extra="allow") + + source_languages: Annotated[ + list[str] | None, + Field( + alias="source_languages", + validation_alias=AliasChoices("source_languages", "sourceLanguages"), + description="Languages detected in the source content.", + default=None, + ), + ] = None + output_language: Annotated[ + str | None, + Field( + alias="output_language", + validation_alias=AliasChoices("output_language", "outputLanguage"), + description="Target language used for the translation.", + default=None, + ), + ] = None + translated_text: Annotated[ + str | None, + Field( + alias="translated_text", + validation_alias=AliasChoices("translated_text", "translatedText"), + description="Inline translation content when output_type is json.", + default=None, + ), + ] = None + input_id: Annotated[ + PdfRestFileID, + Field( + validation_alias=AliasChoices("input_id", "inputId"), + description="The id of the input file.", + ), + ] + + +class TranslatePdfTextFileResponse(PdfRestFileBasedResponse): + """File-based response returned by the translated-pdf-text tool.""" + + model_config = ConfigDict(extra="allow") + + source_languages: Annotated[ + list[str] | None, + Field( + alias="source_languages", + validation_alias=AliasChoices("source_languages", "sourceLanguages"), + description="Languages detected in the source content.", + default=None, + ), + ] = None + output_language: Annotated[ + str | None, + Field( + alias="output_language", + validation_alias=AliasChoices("output_language", "outputLanguage"), + description="Target language used for the translation.", + default=None, + ), + ] = None + + class PdfRestInfoResponse(BaseModel): """A response containing the output from the /info route.""" diff --git a/src/pdfrest/types/__init__.py b/src/pdfrest/types/__init__.py index 9bc36a87..48f78b03 100644 --- a/src/pdfrest/types/__init__.py +++ b/src/pdfrest/types/__init__.py @@ -1,7 +1,17 @@ """Public import surface for shared pdfrest types.""" from .public import ( + ALL_OCR_LANGUAGES, ALL_PDF_INFO_QUERIES, + BmpColorModel, + CompressionLevel, + ExtractTextGranularity, + FlattenQuality, + GifColorModel, + GraphicSmoothing, + JpegColorModel, + OcrLanguage, + PdfAType, PdfInfoQuery, PdfMergeInput, PdfMergeSource, @@ -11,10 +21,26 @@ PdfRedactionType, PdfRGBColor, PdfXType, + PngColorModel, + SummaryFormat, + SummaryOutputFormat, + SummaryOutputType, + TiffColorModel, + TranslateOutputFormat, ) __all__ = [ + "ALL_OCR_LANGUAGES", "ALL_PDF_INFO_QUERIES", + "BmpColorModel", + "CompressionLevel", + "ExtractTextGranularity", + "FlattenQuality", + "GifColorModel", + "GraphicSmoothing", + "JpegColorModel", + "OcrLanguage", + "PdfAType", "PdfInfoQuery", "PdfMergeInput", "PdfMergeSource", @@ -24,4 +50,10 @@ "PdfRedactionPreset", "PdfRedactionType", "PdfXType", + "PngColorModel", + "SummaryFormat", + "SummaryOutputFormat", + "SummaryOutputType", + "TiffColorModel", + "TranslateOutputFormat", ] diff --git a/src/pdfrest/types/public.py b/src/pdfrest/types/public.py index 1df53284..6472f2e7 100644 --- a/src/pdfrest/types/public.py +++ b/src/pdfrest/types/public.py @@ -13,7 +13,17 @@ PdfRestFile = Any __all__ = ( + "ALL_OCR_LANGUAGES", "ALL_PDF_INFO_QUERIES", + "BmpColorModel", + "CompressionLevel", + "ExtractTextGranularity", + "FlattenQuality", + "GifColorModel", + "GraphicSmoothing", + "JpegColorModel", + "OcrLanguage", + "PdfAType", "PdfInfoQuery", "PdfMergeInput", "PdfMergeSource", @@ -23,6 +33,12 @@ "PdfRedactionPreset", "PdfRedactionType", "PdfXType", + "PngColorModel", + "SummaryFormat", + "SummaryOutputFormat", + "SummaryOutputType", + "TiffColorModel", + "TranslateOutputFormat", ) PdfInfoQuery = Literal[ @@ -98,4 +114,49 @@ class PdfMergeSource(TypedDict, total=False): PdfMergeInput = PdfRestFile | PdfMergeSource | tuple[PdfRestFile, PdfPageSelection] +PdfAType = Literal["PDF/A-1b", "PDF/A-2b", "PDF/A-2u", "PDF/A-3b", "PDF/A-3u"] PdfXType = Literal["PDF/X-1a", "PDF/X-3", "PDF/X-4", "PDF/X-6"] +ExtractTextGranularity = Literal["off", "by_page", "document"] +CompressionLevel = Literal["low", "medium", "high", "custom"] +FlattenQuality = Literal["low", "medium", "high"] +PngColorModel = Literal["rgb", "rgba", "gray"] +BmpColorModel = Literal["rgb", "gray"] +GifColorModel = Literal["rgb", "gray"] +JpegColorModel = Literal["rgb", "cmyk", "gray"] +TiffColorModel = Literal["rgb", "rgba", "cmyk", "lab", "gray"] +GraphicSmoothing = Literal["none", "all", "text", "line", "image"] + +SummaryFormat = Literal[ + "overview", + "highlight", + "abstract", + "bullet_points", + "numbered_list", + "table_of_contents", + "outline", + "question_answer", + "action_items", +] + +SummaryOutputFormat = Literal["plaintext", "markdown"] +SummaryOutputType = Literal["json", "file"] + +TranslateOutputFormat = Literal["plaintext", "markdown"] + +OcrLanguage = Literal[ + "ChineseSimplified", + "ChineseTraditional", + "Dutch", + "English", + "French", + "German", + "Italian", + "Japanese", + "Korean", + "Portuguese", + "Spanish", +] + +ALL_OCR_LANGUAGES: tuple[OcrLanguage, ...] = cast( + tuple[OcrLanguage, ...], get_args(OcrLanguage) +) diff --git a/tests/live/test_live_add_attachment_to_pdf.py b/tests/live/test_live_add_attachment_to_pdf.py new file mode 100644 index 00000000..cf5b1aff --- /dev/null +++ b/tests/live/test_live_add_attachment_to_pdf.py @@ -0,0 +1,130 @@ +from __future__ import annotations + +import pytest + +from pdfrest import AsyncPdfRestClient, PdfRestApiError, PdfRestClient +from pdfrest.models import PdfRestFile + +from ..resources import get_test_resource_path + + +@pytest.fixture(scope="module") +def uploaded_pdf_for_attachment( + pdfrest_api_key: str, + pdfrest_live_base_url: str, +) -> PdfRestFile: + resource = get_test_resource_path("report.pdf") + with PdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + return client.files.create_from_paths([resource])[0] + + +@pytest.fixture(scope="module") +def uploaded_attachment_file( + pdfrest_api_key: str, + pdfrest_live_base_url: str, +) -> PdfRestFile: + resource = get_test_resource_path("report.docx") + with PdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + return client.files.create_from_paths([resource])[0] + + +def test_live_add_attachment_to_pdf_success( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_pdf_for_attachment: PdfRestFile, + uploaded_attachment_file: PdfRestFile, +) -> None: + with PdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + response = client.add_attachment_to_pdf( + uploaded_pdf_for_attachment, + attachment=uploaded_attachment_file, + output="with-attachment", + ) + + assert response.output_files + output_file = response.output_file + assert output_file.name.startswith("with-attachment") + assert output_file.type == "application/pdf" + assert output_file.size > 0 + assert response.warning is None + assert [str(file_id) for file_id in response.input_ids] == [ + str(uploaded_pdf_for_attachment.id), + str(uploaded_attachment_file.id), + ] + + +@pytest.mark.asyncio +async def test_live_async_add_attachment_to_pdf_success( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_pdf_for_attachment: PdfRestFile, + uploaded_attachment_file: PdfRestFile, +) -> None: + async with AsyncPdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + response = await client.add_attachment_to_pdf( + uploaded_pdf_for_attachment, + attachment=uploaded_attachment_file, + output="async-attachment", + ) + + assert response.output_files + output_file = response.output_file + assert output_file.name.startswith("async-attachment") + assert output_file.type == "application/pdf" + assert output_file.size > 0 + assert response.warning is None + assert [str(file_id) for file_id in response.input_ids] == [ + str(uploaded_pdf_for_attachment.id), + str(uploaded_attachment_file.id), + ] + + +def test_live_add_attachment_to_pdf_invalid_file_id( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_pdf_for_attachment: PdfRestFile, + uploaded_attachment_file: PdfRestFile, +) -> None: + with ( + PdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client, + pytest.raises(PdfRestApiError, match=r"(?i)(id|file)"), + ): + client.add_attachment_to_pdf( + uploaded_pdf_for_attachment, + attachment=uploaded_attachment_file, + extra_body={"id": "00000000-0000-0000-0000-000000000000"}, + ) + + +@pytest.mark.asyncio +async def test_live_async_add_attachment_to_pdf_invalid_attachment_id( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_pdf_for_attachment: PdfRestFile, + uploaded_attachment_file: PdfRestFile, +) -> None: + async with AsyncPdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + with pytest.raises(PdfRestApiError, match=r"(?i)(id|file)"): + await client.add_attachment_to_pdf( + uploaded_pdf_for_attachment, + attachment=uploaded_attachment_file, + extra_body={"id_to_attach": "ffffffff-ffff-ffff-ffff-ffffffffffff"}, + ) diff --git a/tests/live/test_live_compress_pdf.py b/tests/live/test_live_compress_pdf.py index 6ee8b365..0b3cdf66 100644 --- a/tests/live/test_live_compress_pdf.py +++ b/tests/live/test_live_compress_pdf.py @@ -158,7 +158,7 @@ def test_live_compress_pdf_invalid_level( api_key=pdfrest_api_key, base_url=pdfrest_live_base_url, ) as client, - pytest.raises(PdfRestApiError), + pytest.raises(PdfRestApiError, match=r"(?i)compression"), ): client.compress_pdf( uploaded_pdf_for_compression, @@ -177,7 +177,7 @@ async def test_live_async_compress_pdf_invalid_level( api_key=pdfrest_api_key, base_url=pdfrest_live_base_url, ) as client: - with pytest.raises(PdfRestApiError): + with pytest.raises(PdfRestApiError, match=r"(?i)compression"): await client.compress_pdf( uploaded_pdf_for_compression, compression_level="low", diff --git a/tests/live/test_live_convert_to_excel.py b/tests/live/test_live_convert_to_excel.py new file mode 100644 index 00000000..f592aa40 --- /dev/null +++ b/tests/live/test_live_convert_to_excel.py @@ -0,0 +1,118 @@ +from __future__ import annotations + +import pytest + +from pdfrest import AsyncPdfRestClient, PdfRestApiError, PdfRestClient +from pdfrest.models import PdfRestFile + +from ..resources import get_test_resource_path + + +@pytest.fixture(scope="module") +def uploaded_pdf_for_excel( + pdfrest_api_key: str, + pdfrest_live_base_url: str, +) -> PdfRestFile: + resource = get_test_resource_path("report.pdf") + with PdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + return client.files.create_from_paths([resource])[0] + + +@pytest.mark.parametrize( + "output_name", + [ + pytest.param(None, id="default-output"), + pytest.param("live-excel", id="custom-output"), + ], +) +def test_live_convert_to_excel_success( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_pdf_for_excel: PdfRestFile, + output_name: str | None, +) -> None: + kwargs: dict[str, str] = {} + if output_name is not None: + kwargs["output"] = output_name + + with PdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + response = client.convert_to_excel(uploaded_pdf_for_excel, **kwargs) + + assert response.output_files + output_file = response.output_file + assert ( + output_file.type + == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" + ) + assert output_file.size > 0 + assert response.warning is None + assert str(response.input_id) == str(uploaded_pdf_for_excel.id) + if output_name is not None: + assert output_file.name.startswith(output_name) + else: + assert output_file.name.endswith(".xlsx") + + +@pytest.mark.asyncio +async def test_live_async_convert_to_excel_success( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_pdf_for_excel: PdfRestFile, +) -> None: + async with AsyncPdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + response = await client.convert_to_excel(uploaded_pdf_for_excel, output="async") + + assert response.output_files + output_file = response.output_file + assert output_file.name.startswith("async") + assert ( + output_file.type + == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" + ) + assert output_file.size > 0 + assert response.warning is None + assert str(response.input_id) == str(uploaded_pdf_for_excel.id) + + +def test_live_convert_to_excel_invalid_file_id( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_pdf_for_excel: PdfRestFile, +) -> None: + with ( + PdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client, + pytest.raises(PdfRestApiError, match=r"(?i)(id|file)"), + ): + client.convert_to_excel( + uploaded_pdf_for_excel, + extra_body={"id": "00000000-0000-0000-0000-000000000000"}, + ) + + +@pytest.mark.asyncio +async def test_live_async_convert_to_excel_invalid_file_id( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_pdf_for_excel: PdfRestFile, +) -> None: + async with AsyncPdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + with pytest.raises(PdfRestApiError, match=r"(?i)(id|file)"): + await client.convert_to_excel( + uploaded_pdf_for_excel, + extra_body={"id": "ffffffff-ffff-ffff-ffff-ffffffffffff"}, + ) diff --git a/tests/live/test_live_convert_to_markdown.py b/tests/live/test_live_convert_to_markdown.py new file mode 100644 index 00000000..760e1798 --- /dev/null +++ b/tests/live/test_live_convert_to_markdown.py @@ -0,0 +1,88 @@ +from __future__ import annotations + +import pytest + +from pdfrest import AsyncPdfRestClient, PdfRestApiError, PdfRestClient +from pdfrest.models import PdfRestFileBasedResponse + +from ..resources import get_test_resource_path + + +def test_live_convert_to_markdown_success( + pdfrest_api_key: str, + pdfrest_live_base_url: str, +) -> None: + resource = get_test_resource_path("report.pdf") + with PdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + uploaded = client.files.create_from_paths([resource])[0] + response = client.convert_to_markdown(uploaded) + + assert isinstance(response, PdfRestFileBasedResponse) + assert response.output_files + output_file = response.output_file + assert output_file.name.endswith(".md") + assert output_file.type == "text/markdown" + assert output_file.size > 0 + assert response.warning is None + assert response.input_id == uploaded.id + + +@pytest.mark.asyncio +async def test_live_async_convert_to_markdown_success( + pdfrest_api_key: str, + pdfrest_live_base_url: str, +) -> None: + resource = get_test_resource_path("report.pdf") + async with AsyncPdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + uploaded = (await client.files.create_from_paths([resource]))[0] + response = await client.convert_to_markdown(uploaded, output="async-md") + + assert isinstance(response, PdfRestFileBasedResponse) + assert response.output_files + output_file = response.output_file + assert output_file.name.startswith("async-md") + assert output_file.type == "text/markdown" + assert output_file.size > 0 + assert response.warning is None + assert response.input_id == uploaded.id + + +def test_live_convert_to_markdown_invalid_pages( + pdfrest_api_key: str, + pdfrest_live_base_url: str, +) -> None: + resource = get_test_resource_path("report.pdf") + with PdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + uploaded = client.files.create_from_paths([resource])[0] + with pytest.raises(PdfRestApiError, match=r"(?i)page"): + client.convert_to_markdown( + uploaded, + extra_body={"pages": "last-1"}, + ) + + +@pytest.mark.asyncio +async def test_live_async_convert_to_markdown_invalid_pages( + pdfrest_api_key: str, + pdfrest_live_base_url: str, +) -> None: + resource = get_test_resource_path("report.pdf") + async with AsyncPdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + uploaded = (await client.files.create_from_paths([resource]))[0] + with pytest.raises(PdfRestApiError, match=r"(?i)page"): + await client.convert_to_markdown( + uploaded, + extra_body={"pages": "last-1"}, + ) diff --git a/tests/live/test_live_convert_to_pdfa.py b/tests/live/test_live_convert_to_pdfa.py new file mode 100644 index 00000000..5d39d009 --- /dev/null +++ b/tests/live/test_live_convert_to_pdfa.py @@ -0,0 +1,147 @@ +from __future__ import annotations + +from typing import cast, get_args + +import pytest + +from pdfrest import AsyncPdfRestClient, PdfRestApiError, PdfRestClient +from pdfrest.models import PdfRestFile +from pdfrest.types import PdfAType + +from ..resources import get_test_resource_path + +PDFA_TYPES: tuple[PdfAType, ...] = cast(tuple[PdfAType, ...], get_args(PdfAType)) +PDFA_TYPE_PARAMS = [ + pytest.param(output_type, id=output_type) for output_type in PDFA_TYPES +] + + +@pytest.fixture(scope="module") +def uploaded_pdf_for_pdfa( + pdfrest_api_key: str, + pdfrest_live_base_url: str, +) -> PdfRestFile: + resource = get_test_resource_path("report.pdf") + with PdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + return client.files.create_from_paths([resource])[0] + + +@pytest.mark.parametrize("output_type", PDFA_TYPE_PARAMS) +def test_live_convert_to_pdfa_success( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_pdf_for_pdfa: PdfRestFile, + output_type: PdfAType, +) -> None: + with PdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + response = client.convert_to_pdfa( + uploaded_pdf_for_pdfa, + output_type=output_type, + output="pdfa-live", + ) + + assert response.output_files + output_file = response.output_file + assert output_file.type == "application/pdf" + assert str(response.input_id) == str(uploaded_pdf_for_pdfa.id) + assert output_file.name.startswith("pdfa-live") + + +@pytest.mark.asyncio +@pytest.mark.parametrize("output_type", PDFA_TYPE_PARAMS) +async def test_live_async_convert_to_pdfa_success( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_pdf_for_pdfa: PdfRestFile, + output_type: PdfAType, +) -> None: + async with AsyncPdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + response = await client.convert_to_pdfa( + uploaded_pdf_for_pdfa, + output_type=output_type, + output="async-pdfa", + ) + + assert response.output_files + output_file = response.output_file + assert output_file.name.startswith("async-pdfa") + assert output_file.type == "application/pdf" + assert str(response.input_id) == str(uploaded_pdf_for_pdfa.id) + + +def test_live_convert_to_pdfa_with_rasterize_option( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_pdf_for_pdfa: PdfRestFile, +) -> None: + with PdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + response = client.convert_to_pdfa( + uploaded_pdf_for_pdfa, + output_type="PDF/A-2b", + rasterize_if_errors_encountered="on", + output="pdfa-rasterize", + ) + + assert response.output_files + output_file = response.output_file + assert output_file.name.startswith("pdfa-rasterize") + assert output_file.type == "application/pdf" + assert str(response.input_id) == str(uploaded_pdf_for_pdfa.id) + + +@pytest.mark.parametrize( + "invalid_output_type", + [ + pytest.param("PDF/A-0", id="pdfa-0"), + pytest.param("PDF/A-99", id="pdfa-99"), + pytest.param("pdf/a-2b", id="lowercase"), + ], +) +def test_live_convert_to_pdfa_invalid_output_type( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_pdf_for_pdfa: PdfRestFile, + invalid_output_type: str, +) -> None: + with ( + PdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client, + pytest.raises(PdfRestApiError, match=r"(?i)pdf.?a"), + ): + client.convert_to_pdfa( + uploaded_pdf_for_pdfa, + output_type="PDF/A-1b", + extra_body={"output_type": invalid_output_type}, + ) + + +@pytest.mark.asyncio +async def test_live_async_convert_to_pdfa_invalid_output_type( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_pdf_for_pdfa: PdfRestFile, +) -> None: + async with AsyncPdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + with pytest.raises(PdfRestApiError, match=r"(?i)pdf.?a"): + await client.convert_to_pdfa( + uploaded_pdf_for_pdfa, + output_type="PDF/A-1b", + extra_body={"output_type": "PDF/A-0"}, + ) diff --git a/tests/live/test_live_convert_to_pdfx.py b/tests/live/test_live_convert_to_pdfx.py index a08088b0..df0e6695 100644 --- a/tests/live/test_live_convert_to_pdfx.py +++ b/tests/live/test_live_convert_to_pdfx.py @@ -4,7 +4,7 @@ import pytest -from pdfrest import PdfRestApiError, PdfRestClient +from pdfrest import AsyncPdfRestClient, PdfRestApiError, PdfRestClient from pdfrest.models import PdfRestFile from pdfrest.types import PdfXType @@ -50,6 +50,31 @@ def test_live_convert_to_pdfx_success( assert output_file.name.startswith("pdfx-live") +@pytest.mark.asyncio +@pytest.mark.parametrize("output_type", PDFX_TYPES, ids=list(PDFX_TYPES)) +async def test_live_async_convert_to_pdfx_success( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_pdf_for_pdfx: PdfRestFile, + output_type: PdfXType, +) -> None: + async with AsyncPdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + response = await client.convert_to_pdfx( + uploaded_pdf_for_pdfx, + output_type=output_type, + output="async-pdfx", + ) + + assert response.output_files + output_file = response.output_file + assert output_file.name.startswith("async-pdfx") + assert output_file.type == "application/pdf" + assert str(response.input_id) == str(uploaded_pdf_for_pdfx.id) + + @pytest.mark.parametrize( "invalid_output_type", [ @@ -69,10 +94,28 @@ def test_live_convert_to_pdfx_invalid_output_type( api_key=pdfrest_api_key, base_url=pdfrest_live_base_url, ) as client, - pytest.raises(PdfRestApiError), + pytest.raises(PdfRestApiError, match=r"(?i)pdf.?x"), ): client.convert_to_pdfx( uploaded_pdf_for_pdfx, output_type="PDF/X-1a", extra_body={"output_type": invalid_output_type}, ) + + +@pytest.mark.asyncio +async def test_live_async_convert_to_pdfx_invalid_output_type( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_pdf_for_pdfx: PdfRestFile, +) -> None: + async with AsyncPdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + with pytest.raises(PdfRestApiError, match=r"(?i)pdf.?x"): + await client.convert_to_pdfx( + uploaded_pdf_for_pdfx, + output_type="PDF/X-1a", + extra_body={"output_type": "PDF/X-0"}, + ) diff --git a/tests/live/test_live_convert_to_powerpoint.py b/tests/live/test_live_convert_to_powerpoint.py new file mode 100644 index 00000000..8a1209a2 --- /dev/null +++ b/tests/live/test_live_convert_to_powerpoint.py @@ -0,0 +1,120 @@ +from __future__ import annotations + +import pytest + +from pdfrest import AsyncPdfRestClient, PdfRestApiError, PdfRestClient +from pdfrest.models import PdfRestFile + +from ..resources import get_test_resource_path + + +@pytest.fixture(scope="module") +def uploaded_pdf_for_powerpoint( + pdfrest_api_key: str, + pdfrest_live_base_url: str, +) -> PdfRestFile: + resource = get_test_resource_path("report.pdf") + with PdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + return client.files.create_from_paths([resource])[0] + + +@pytest.mark.parametrize( + "output_name", + [ + pytest.param(None, id="default-output"), + pytest.param("live-powerpoint", id="custom-output"), + ], +) +def test_live_convert_to_powerpoint_success( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_pdf_for_powerpoint: PdfRestFile, + output_name: str | None, +) -> None: + kwargs: dict[str, str] = {} + if output_name is not None: + kwargs["output"] = output_name + + with PdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + response = client.convert_to_powerpoint(uploaded_pdf_for_powerpoint, **kwargs) + + assert response.output_files + output_file = response.output_file + assert ( + output_file.type + == "application/vnd.openxmlformats-officedocument.presentationml.presentation" + ) + assert output_file.size > 0 + assert response.warning is None + assert str(response.input_id) == str(uploaded_pdf_for_powerpoint.id) + if output_name is not None: + assert output_file.name.startswith(output_name) + else: + assert output_file.name.endswith(".pptx") + + +@pytest.mark.asyncio +async def test_live_async_convert_to_powerpoint_success( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_pdf_for_powerpoint: PdfRestFile, +) -> None: + async with AsyncPdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + response = await client.convert_to_powerpoint( + uploaded_pdf_for_powerpoint, output="async" + ) + + assert response.output_files + output_file = response.output_file + assert output_file.name.startswith("async") + assert ( + output_file.type + == "application/vnd.openxmlformats-officedocument.presentationml.presentation" + ) + assert output_file.size > 0 + assert response.warning is None + assert str(response.input_id) == str(uploaded_pdf_for_powerpoint.id) + + +def test_live_convert_to_powerpoint_invalid_file_id( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_pdf_for_powerpoint: PdfRestFile, +) -> None: + with ( + PdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client, + pytest.raises(PdfRestApiError, match=r"(?i)(id|file)"), + ): + client.convert_to_powerpoint( + uploaded_pdf_for_powerpoint, + extra_body={"id": "00000000-0000-0000-0000-000000000000"}, + ) + + +@pytest.mark.asyncio +async def test_live_async_convert_to_powerpoint_invalid_file_id( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_pdf_for_powerpoint: PdfRestFile, +) -> None: + async with AsyncPdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + with pytest.raises(PdfRestApiError, match=r"(?i)(id|file)"): + await client.convert_to_powerpoint( + uploaded_pdf_for_powerpoint, + extra_body={"id": "ffffffff-ffff-ffff-ffff-ffffffffffff"}, + ) diff --git a/tests/live/test_live_convert_to_word.py b/tests/live/test_live_convert_to_word.py index c3c5822e..3ec6a334 100644 --- a/tests/live/test_live_convert_to_word.py +++ b/tests/live/test_live_convert_to_word.py @@ -2,7 +2,7 @@ import pytest -from pdfrest import PdfRestApiError, PdfRestClient +from pdfrest import AsyncPdfRestClient, PdfRestApiError, PdfRestClient from pdfrest.models import PdfRestFile from ..resources import get_test_resource_path @@ -57,6 +57,31 @@ def test_live_convert_to_word_success( assert output_file.name.endswith(".docx") +@pytest.mark.asyncio +async def test_live_async_convert_to_word_success( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_pdf_for_word: PdfRestFile, +) -> None: + async with AsyncPdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + response = await client.convert_to_word( + uploaded_pdf_for_word, + output="async-word", + ) + + assert response.output_files + output_file = response.output_file + assert output_file.name.startswith("async-word") + assert ( + output_file.type + == "application/vnd.openxmlformats-officedocument.wordprocessingml.document" + ) + assert str(response.input_id) == str(uploaded_pdf_for_word.id) + + def test_live_convert_to_word_invalid_file_id( pdfrest_api_key: str, pdfrest_live_base_url: str, @@ -67,9 +92,26 @@ def test_live_convert_to_word_invalid_file_id( api_key=pdfrest_api_key, base_url=pdfrest_live_base_url, ) as client, - pytest.raises(PdfRestApiError), + pytest.raises(PdfRestApiError, match=r"(?i)(id|file)"), ): client.convert_to_word( uploaded_pdf_for_word, extra_body={"id": "00000000-0000-0000-0000-000000000000"}, ) + + +@pytest.mark.asyncio +async def test_live_async_convert_to_word_invalid_file_id( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_pdf_for_word: PdfRestFile, +) -> None: + async with AsyncPdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + with pytest.raises(PdfRestApiError, match=r"(?i)(id|file)"): + await client.convert_to_word( + uploaded_pdf_for_word, + extra_body={"id": "ffffffff-ffff-ffff-ffff-ffffffffffff"}, + ) diff --git a/tests/live/test_live_convert_xfa_to_acroforms.py b/tests/live/test_live_convert_xfa_to_acroforms.py new file mode 100644 index 00000000..dba38304 --- /dev/null +++ b/tests/live/test_live_convert_xfa_to_acroforms.py @@ -0,0 +1,126 @@ +from __future__ import annotations + +import pytest + +from pdfrest import AsyncPdfRestClient, PdfRestApiError, PdfRestClient +from pdfrest.models import PdfRestFile + +from ..resources import get_test_resource_path + +WARNING_NO_XFA_FORMS = ( + "No XFA forms were detected in the input PDF. No output was produced." +) + + +@pytest.fixture(scope="module") +def uploaded_pdf_for_acroforms( + pdfrest_api_key: str, + pdfrest_live_base_url: str, +) -> PdfRestFile: + resource = get_test_resource_path("report.pdf") + with PdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + return client.files.create_from_paths([resource])[0] + + +@pytest.mark.parametrize( + "output_name", + [ + pytest.param(None, id="default-output"), + pytest.param("live-acroforms", id="custom-output"), + ], +) +def test_live_convert_xfa_to_acroforms_success( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_pdf_for_acroforms: PdfRestFile, + output_name: str | None, +) -> None: + kwargs: dict[str, str] = {} + if output_name is not None: + kwargs["output"] = output_name + + with PdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + response = client.convert_xfa_to_acroforms(uploaded_pdf_for_acroforms, **kwargs) + + assert str(response.input_id) == str(uploaded_pdf_for_acroforms.id) + if response.warning is not None: + assert response.warning == WARNING_NO_XFA_FORMS + assert response.output_files == [] + return + + assert response.output_files + output_file = response.output_file + assert output_file.type == "application/pdf" + assert output_file.size > 0 + if output_name is not None: + assert output_file.name.startswith(output_name) + else: + assert output_file.name.endswith(".pdf") + + +def test_live_convert_xfa_to_acroforms_invalid_file_id( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_pdf_for_acroforms: PdfRestFile, +) -> None: + with ( + PdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client, + pytest.raises(PdfRestApiError, match=r"(?i)(id|file)"), + ): + client.convert_xfa_to_acroforms( + uploaded_pdf_for_acroforms, + extra_body={"id": "00000000-0000-0000-0000-000000000000"}, + ) + + +@pytest.mark.asyncio +async def test_live_async_convert_xfa_to_acroforms_success( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_pdf_for_acroforms: PdfRestFile, +) -> None: + async with AsyncPdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + response = await client.convert_xfa_to_acroforms( + uploaded_pdf_for_acroforms, output="async" + ) + + assert str(response.input_id) == str(uploaded_pdf_for_acroforms.id) + if response.warning is not None: + assert response.warning == WARNING_NO_XFA_FORMS + assert response.output_files == [] + return + + assert response.output_files + output_file = response.output_file + assert output_file.name.startswith("async") + assert output_file.type == "application/pdf" + assert output_file.size > 0 + + +@pytest.mark.asyncio +async def test_live_async_convert_xfa_to_acroforms_invalid_file_id( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_pdf_for_acroforms: PdfRestFile, +) -> None: + async with AsyncPdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + with pytest.raises(PdfRestApiError, match=r"(?i)(id|file)"): + await client.convert_xfa_to_acroforms( + uploaded_pdf_for_acroforms, + extra_body={"id": "ffffffff-ffff-ffff-ffff-ffffffffffff"}, + ) diff --git a/tests/live/test_live_delete.py b/tests/live/test_live_delete.py index 75727fef..52bdf6fd 100644 --- a/tests/live/test_live_delete.py +++ b/tests/live/test_live_delete.py @@ -57,7 +57,7 @@ def test_live_delete_files_invalid_id( base_url=pdfrest_live_base_url, ) as client: uploaded = client.files.create_from_paths([resource])[0] - with pytest.raises(ValidationError): + with pytest.raises(ValidationError, match=r"(?i)ids?"): client.files.delete(uploaded, extra_body={"ids": token_urlsafe(16)}) @@ -72,7 +72,7 @@ async def test_live_async_delete_files_invalid_id( base_url=pdfrest_live_base_url, ) as client: uploaded = (await client.files.create_from_paths([resource]))[0] - with pytest.raises(ValidationError): + with pytest.raises(ValidationError, match=r"(?i)ids?"): await client.files.delete(uploaded, extra_body={"ids": token_urlsafe(16)}) diff --git a/tests/live/test_live_extract_images.py b/tests/live/test_live_extract_images.py new file mode 100644 index 00000000..3410622a --- /dev/null +++ b/tests/live/test_live_extract_images.py @@ -0,0 +1,95 @@ +from __future__ import annotations + +import pytest + +from pdfrest import AsyncPdfRestClient, PdfRestApiError, PdfRestClient +from pdfrest.models import PdfRestFileBasedResponse + +from ..resources import get_test_resource_path + + +def test_live_extract_images_success( + pdfrest_api_key: str, + pdfrest_live_base_url: str, +) -> None: + resource = get_test_resource_path("duckhat.pdf") + with PdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + uploaded = client.files.create_from_paths([resource])[0] + response = client.extract_images(uploaded) + + assert isinstance(response, PdfRestFileBasedResponse) + output_files = response.output_files + assert output_files + assert all(file.name for file in output_files) + assert all( + file.type and (file.type.startswith("image/") or file.type == "application/zip") + for file in output_files + ) + assert all(file.size > 0 for file in output_files) + assert response.warning is None + assert response.input_id == uploaded.id + + +@pytest.mark.asyncio +async def test_live_async_extract_images_success( + pdfrest_api_key: str, + pdfrest_live_base_url: str, +) -> None: + resource = get_test_resource_path("duckhat.pdf") + async with AsyncPdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + uploaded = (await client.files.create_from_paths([resource]))[0] + response = await client.extract_images(uploaded, output="async-images") + + assert isinstance(response, PdfRestFileBasedResponse) + output_files = response.output_files + assert output_files + assert output_files[0].name.startswith("async-images") + assert all(file.name for file in output_files) + assert all( + file.type and (file.type.startswith("image/") or file.type == "application/zip") + for file in output_files + ) + assert all(file.size > 0 for file in output_files) + assert response.warning is None + assert response.input_id == uploaded.id + + +def test_live_extract_images_invalid_pages( + pdfrest_api_key: str, + pdfrest_live_base_url: str, +) -> None: + resource = get_test_resource_path("duckhat.pdf") + with PdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + uploaded = client.files.create_from_paths([resource])[0] + with pytest.raises(PdfRestApiError, match=r"(?i)page"): + client.extract_images( + uploaded, + extra_body={"pages": "last-1"}, + ) + + +@pytest.mark.asyncio +async def test_live_async_extract_images_invalid_pages( + pdfrest_api_key: str, + pdfrest_live_base_url: str, +) -> None: + resource = get_test_resource_path("duckhat.pdf") + async with AsyncPdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + uploaded = (await client.files.create_from_paths([resource]))[0] + with pytest.raises(PdfRestApiError, match=r"(?i)page"): + await client.extract_images( + uploaded, + extra_body={"pages": "last-1"}, + ) diff --git a/tests/live/test_live_extract_pdf_text_to_file.py b/tests/live/test_live_extract_pdf_text_to_file.py new file mode 100644 index 00000000..d6e58652 --- /dev/null +++ b/tests/live/test_live_extract_pdf_text_to_file.py @@ -0,0 +1,102 @@ +from __future__ import annotations + +import pytest + +from pdfrest import AsyncPdfRestClient, PdfRestApiError, PdfRestClient +from pdfrest.models import PdfRestFileBasedResponse + +from ..resources import get_test_resource_path + + +def test_live_extract_pdf_text_to_file_success( + pdfrest_api_key: str, + pdfrest_live_base_url: str, +) -> None: + resource = get_test_resource_path("report.pdf") + with PdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + uploaded = client.files.create_from_paths([resource])[0] + response = client.extract_pdf_text_to_file( + uploaded, + full_text="document", + preserve_line_breaks="on", + word_style="off", + word_coordinates="off", + ) + + assert isinstance(response, PdfRestFileBasedResponse) + assert response.output_files + output_file = response.output_file + assert output_file.name.endswith(".json") + assert output_file.type == "application/json" + assert output_file.size > 0 + assert response.warning is None + assert response.input_id == uploaded.id + + +@pytest.mark.asyncio +async def test_live_async_extract_pdf_text_to_file_success( + pdfrest_api_key: str, + pdfrest_live_base_url: str, +) -> None: + resource = get_test_resource_path("report.pdf") + async with AsyncPdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + uploaded = (await client.files.create_from_paths([resource]))[0] + response = await client.extract_pdf_text_to_file( + uploaded, + full_text="document", + preserve_line_breaks="on", + word_style="off", + word_coordinates="off", + output="async-text", + ) + + assert isinstance(response, PdfRestFileBasedResponse) + assert response.output_files + output_file = response.output_file + assert output_file.name.startswith("async-text") + assert output_file.name.endswith(".json") + assert output_file.type == "application/json" + assert output_file.size > 0 + assert response.warning is None + assert response.input_id == uploaded.id + + +def test_live_extract_pdf_text_to_file_invalid_pages( + pdfrest_api_key: str, + pdfrest_live_base_url: str, +) -> None: + resource = get_test_resource_path("report.pdf") + with PdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + uploaded = client.files.create_from_paths([resource])[0] + with pytest.raises(PdfRestApiError, match=r"(?i)page"): + client.extract_pdf_text_to_file( + uploaded, + extra_body={"pages": "last-1"}, + ) + + +@pytest.mark.asyncio +async def test_live_async_extract_pdf_text_to_file_invalid_pages( + pdfrest_api_key: str, + pdfrest_live_base_url: str, +) -> None: + resource = get_test_resource_path("report.pdf") + async with AsyncPdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + uploaded = (await client.files.create_from_paths([resource]))[0] + with pytest.raises(PdfRestApiError, match=r"(?i)page"): + await client.extract_pdf_text_to_file( + uploaded, + extra_body={"pages": "last-1"}, + ) diff --git a/tests/live/test_live_flatten_annotations.py b/tests/live/test_live_flatten_annotations.py new file mode 100644 index 00000000..b97b08b0 --- /dev/null +++ b/tests/live/test_live_flatten_annotations.py @@ -0,0 +1,114 @@ +from __future__ import annotations + +import pytest + +from pdfrest import AsyncPdfRestClient, PdfRestApiError, PdfRestClient +from pdfrest.models import PdfRestFile + +from ..resources import get_test_resource_path + + +@pytest.fixture(scope="module") +def uploaded_pdf_for_annotations( + pdfrest_api_key: str, + pdfrest_live_base_url: str, +) -> PdfRestFile: + resource = get_test_resource_path("report.pdf") + with PdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + return client.files.create_from_paths([resource])[0] + + +@pytest.mark.parametrize( + "output_name", + [ + pytest.param(None, id="default-output"), + pytest.param("flatten-annotations", id="custom-output"), + ], +) +def test_live_flatten_annotations_success( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_pdf_for_annotations: PdfRestFile, + output_name: str | None, +) -> None: + kwargs: dict[str, str] = {} + if output_name is not None: + kwargs["output"] = output_name + + with PdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + response = client.flatten_annotations(uploaded_pdf_for_annotations, **kwargs) + + assert response.output_files + output_file = response.output_file + assert output_file.type == "application/pdf" + assert output_file.size > 0 + assert response.warning is None + assert str(response.input_id) == str(uploaded_pdf_for_annotations.id) + if output_name is not None: + assert output_file.name.startswith(output_name) + else: + assert output_file.name.endswith(".pdf") + + +@pytest.mark.asyncio +async def test_live_async_flatten_annotations_success( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_pdf_for_annotations: PdfRestFile, +) -> None: + async with AsyncPdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + response = await client.flatten_annotations( + uploaded_pdf_for_annotations, output="async" + ) + + assert response.output_files + output_file = response.output_file + assert output_file.name.startswith("async") + assert output_file.type == "application/pdf" + assert output_file.size > 0 + assert response.warning is None + assert str(response.input_id) == str(uploaded_pdf_for_annotations.id) + + +def test_live_flatten_annotations_invalid_file_id( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_pdf_for_annotations: PdfRestFile, +) -> None: + with ( + PdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client, + pytest.raises(PdfRestApiError, match=r"(?i)(id|file)"), + ): + client.flatten_annotations( + uploaded_pdf_for_annotations, + extra_body={"id": "00000000-0000-0000-0000-000000000000"}, + ) + + +@pytest.mark.asyncio +async def test_live_async_flatten_annotations_invalid_file_id( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_pdf_for_annotations: PdfRestFile, +) -> None: + async with AsyncPdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + with pytest.raises(PdfRestApiError, match=r"(?i)(id|file)"): + await client.flatten_annotations( + uploaded_pdf_for_annotations, + extra_body={"id": "ffffffff-ffff-ffff-ffff-ffffffffffff"}, + ) diff --git a/tests/live/test_live_flatten_pdf_forms.py b/tests/live/test_live_flatten_pdf_forms.py index c6ad7fdb..5bff7304 100644 --- a/tests/live/test_live_flatten_pdf_forms.py +++ b/tests/live/test_live_flatten_pdf_forms.py @@ -2,7 +2,7 @@ import pytest -from pdfrest import PdfRestApiError, PdfRestClient +from pdfrest import AsyncPdfRestClient, PdfRestApiError, PdfRestClient from pdfrest.models import PdfRestFile from ..resources import get_test_resource_path @@ -54,6 +54,28 @@ def test_live_flatten_pdf_forms( assert output_file.name.endswith(".pdf") +@pytest.mark.asyncio +async def test_live_async_flatten_pdf_forms_success( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_pdf_with_forms: PdfRestFile, +) -> None: + async with AsyncPdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + response = await client.flatten_pdf_forms( + uploaded_pdf_with_forms, + output="async-flattened", + ) + + assert response.output_files + output_file = response.output_file + assert output_file.name.startswith("async-flattened") + assert output_file.type == "application/pdf" + assert str(response.input_id) == str(uploaded_pdf_with_forms.id) + + def test_live_flatten_pdf_forms_invalid_file_id( pdfrest_api_key: str, pdfrest_live_base_url: str, @@ -64,9 +86,26 @@ def test_live_flatten_pdf_forms_invalid_file_id( api_key=pdfrest_api_key, base_url=pdfrest_live_base_url, ) as client, - pytest.raises(PdfRestApiError), + pytest.raises(PdfRestApiError, match=r"(?i)(id|file)"), ): client.flatten_pdf_forms( uploaded_pdf_with_forms, extra_body={"id": "ffffffff-ffff-ffff-ffff-ffffffffffff"}, ) + + +@pytest.mark.asyncio +async def test_live_async_flatten_pdf_forms_invalid_file_id( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_pdf_with_forms: PdfRestFile, +) -> None: + async with AsyncPdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + with pytest.raises(PdfRestApiError, match=r"(?i)(id|file)"): + await client.flatten_pdf_forms( + uploaded_pdf_with_forms, + extra_body={"id": "00000000-0000-0000-0000-000000000000"}, + ) diff --git a/tests/live/test_live_flatten_transparencies.py b/tests/live/test_live_flatten_transparencies.py new file mode 100644 index 00000000..7da1eb40 --- /dev/null +++ b/tests/live/test_live_flatten_transparencies.py @@ -0,0 +1,117 @@ +from __future__ import annotations + +import pytest + +from pdfrest import AsyncPdfRestClient, PdfRestApiError, PdfRestClient +from pdfrest.models import PdfRestFile + +from ..resources import get_test_resource_path + + +@pytest.fixture(scope="module") +def uploaded_pdf_for_transparencies( + pdfrest_api_key: str, + pdfrest_live_base_url: str, +) -> PdfRestFile: + resource = get_test_resource_path("report.pdf") + with PdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + return client.files.create_from_paths([resource])[0] + + +@pytest.mark.parametrize( + ("output_name", "quality"), + [ + pytest.param(None, "medium", id="default-output"), + pytest.param("flatten-transparency", "high", id="custom-output-high"), + ], +) +def test_live_flatten_transparencies_success( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_pdf_for_transparencies: PdfRestFile, + output_name: str | None, + quality: str, +) -> None: + kwargs: dict[str, str] = {"quality": quality} + if output_name is not None: + kwargs["output"] = output_name + + with PdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + response = client.flatten_transparencies( + uploaded_pdf_for_transparencies, **kwargs + ) + + assert response.output_files + output_file = response.output_file + assert output_file.type == "application/pdf" + assert output_file.size > 0 + assert response.warning is None + assert str(response.input_id) == str(uploaded_pdf_for_transparencies.id) + if output_name is not None: + assert output_file.name.startswith(output_name) + else: + assert output_file.name.endswith(".pdf") + + +@pytest.mark.asyncio +async def test_live_async_flatten_transparencies_success( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_pdf_for_transparencies: PdfRestFile, +) -> None: + async with AsyncPdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + response = await client.flatten_transparencies( + uploaded_pdf_for_transparencies, output="async", quality="low" + ) + + assert response.output_files + output_file = response.output_file + assert output_file.name.startswith("async") + assert output_file.type == "application/pdf" + assert output_file.size > 0 + assert response.warning is None + assert str(response.input_id) == str(uploaded_pdf_for_transparencies.id) + + +def test_live_flatten_transparencies_invalid_file_id( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_pdf_for_transparencies: PdfRestFile, +) -> None: + with ( + PdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client, + pytest.raises(PdfRestApiError, match=r"(?i)(id|file)"), + ): + client.flatten_transparencies( + uploaded_pdf_for_transparencies, + extra_body={"id": "00000000-0000-0000-0000-000000000000"}, + ) + + +@pytest.mark.asyncio +async def test_live_async_flatten_transparencies_invalid_file_id( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_pdf_for_transparencies: PdfRestFile, +) -> None: + async with AsyncPdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + with pytest.raises(PdfRestApiError, match=r"(?i)(id|file)"): + await client.flatten_transparencies( + uploaded_pdf_for_transparencies, + extra_body={"id": "ffffffff-ffff-ffff-ffff-ffffffffffff"}, + ) diff --git a/tests/live/test_live_graphic_conversions.py b/tests/live/test_live_graphic_conversions.py index 2b68edb3..a78f8d0f 100644 --- a/tests/live/test_live_graphic_conversions.py +++ b/tests/live/test_live_graphic_conversions.py @@ -5,7 +5,7 @@ import pytest -from pdfrest import PdfRestApiError, PdfRestClient +from pdfrest import AsyncPdfRestClient, PdfRestApiError, PdfRestClient from pdfrest.models import PdfRestFile from pdfrest.models._internal import ( BasePdfRestGraphicPayload, @@ -121,6 +121,28 @@ def uploaded_20_page_pdf( return client.files.create_from_paths([resource])[0] +@pytest.mark.asyncio +async def test_live_async_convert_to_png_success( + pdfrest_api_key: str, + pdfrest_live_base_url: str, +) -> None: + resource = get_test_resource_path("report.pdf") + async with AsyncPdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + uploaded = (await client.files.create_from_paths([resource]))[0] + response = await client.convert_to_png( + uploaded, + output_prefix="async-png", + resolution=150, + ) + + assert response.output_files + assert all(file_info.type == "image/png" for file_info in response.output_files) + assert str(response.input_id) == str(uploaded.id) + + @pytest.mark.parametrize( ("_endpoint_label", "spec", "color_model"), _valid_color_cases(), @@ -168,7 +190,7 @@ def test_live_graphic_invalid_color_model( uploaded = client.files.create_from_paths([resource])[0] client_method = getattr(client, spec.method_name) resolution = _resolution_bounds(payload_model)[0] - with pytest.raises(PdfRestApiError): + with pytest.raises(PdfRestApiError, match=r"(?i)color"): client_method( uploaded, resolution=resolution, @@ -213,7 +235,7 @@ def test_live_graphic_resolution_bounds( if should_raise: call_kwargs["extra_body"] = {"resolution": base_resolution + offset} - with pytest.raises(PdfRestApiError): + with pytest.raises(PdfRestApiError, match=r"(?i)resolution"): client_method(uploaded, **call_kwargs) else: response = client_method(uploaded, **call_kwargs) @@ -261,7 +283,7 @@ def test_live_graphic_invalid_smoothing( ) as client: uploaded = client.files.create_from_paths([resource])[0] client_method = getattr(client, spec.method_name) - with pytest.raises(PdfRestApiError): + with pytest.raises(PdfRestApiError, match=r"(?i)smooth"): client_method( uploaded, smoothing="none", @@ -269,6 +291,25 @@ def test_live_graphic_invalid_smoothing( ) +@pytest.mark.asyncio +async def test_live_async_graphic_invalid_smoothing( + pdfrest_api_key: str, + pdfrest_live_base_url: str, +) -> None: + resource = get_test_resource_path("report.pdf") + async with AsyncPdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + uploaded = (await client.files.create_from_paths([resource]))[0] + with pytest.raises(PdfRestApiError, match=r"(?i)smooth"): + await client.convert_to_png( + uploaded, + smoothing="none", + extra_body={"smoothing": "super-smooth"}, + ) + + @pytest.mark.parametrize( ("page_range", "expect_success"), [ @@ -316,7 +357,7 @@ def test_live_png_page_range_variants( ) assert str(response.input_id) == str(uploaded_20_page_pdf.id) else: - with pytest.raises(PdfRestApiError): + with pytest.raises(PdfRestApiError, match=r"(?i)page"): client.convert_to_png( uploaded_20_page_pdf, output_prefix=f"live-range-{case_id}", @@ -348,7 +389,10 @@ def test_live_png_page_range_invalid_overrides( api_key=pdfrest_api_key, base_url=pdfrest_live_base_url, ) as client, - pytest.raises(PdfRestApiError), + pytest.raises( + PdfRestApiError, + match=r"There was an issue processing your file\. Validate all fields and try again\.", + ), ): client.convert_to_png( uploaded_20_page_pdf, diff --git a/tests/live/test_live_linearize_pdf.py b/tests/live/test_live_linearize_pdf.py new file mode 100644 index 00000000..523ea0d5 --- /dev/null +++ b/tests/live/test_live_linearize_pdf.py @@ -0,0 +1,115 @@ +from __future__ import annotations + +import pytest + +from pdfrest import AsyncPdfRestClient, PdfRestApiError, PdfRestClient +from pdfrest.models import PdfRestFile + +from ..resources import get_test_resource_path + + +@pytest.fixture(scope="module") +def uploaded_pdf_for_linearize( + pdfrest_api_key: str, + pdfrest_live_base_url: str, +) -> PdfRestFile: + resource = get_test_resource_path("report.pdf") + with PdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + return client.files.create_from_paths([resource])[0] + + +@pytest.mark.parametrize( + "output_name", + [ + pytest.param(None, id="default-output"), + pytest.param("linearized-live", id="custom-output"), + ], +) +def test_live_linearize_pdf( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_pdf_for_linearize: PdfRestFile, + output_name: str | None, +) -> None: + kwargs: dict[str, str] = {} + if output_name is not None: + kwargs["output"] = output_name + + with PdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + response = client.linearize_pdf(uploaded_pdf_for_linearize, **kwargs) + + assert response.output_files + output_file = response.output_file + assert output_file.type == "application/pdf" + assert output_file.size > 0 + assert response.warning is None + assert str(response.input_id) == str(uploaded_pdf_for_linearize.id) + if output_name is not None: + assert output_file.name.startswith(output_name) + else: + assert output_file.name.endswith(".pdf") + + +@pytest.mark.asyncio +async def test_live_async_linearize_pdf( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_pdf_for_linearize: PdfRestFile, +) -> None: + async with AsyncPdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + response = await client.linearize_pdf( + uploaded_pdf_for_linearize, + output="async-linearized", + ) + + assert response.output_files + output_file = response.output_file + assert output_file.name.startswith("async-linearized") + assert output_file.type == "application/pdf" + assert output_file.size > 0 + assert response.warning is None + assert str(response.input_id) == str(uploaded_pdf_for_linearize.id) + + +def test_live_linearize_pdf_invalid_file_id( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_pdf_for_linearize: PdfRestFile, +) -> None: + with ( + PdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client, + pytest.raises(PdfRestApiError, match=r"(?i)(id|file)"), + ): + client.linearize_pdf( + uploaded_pdf_for_linearize, + extra_body={"id": "ffffffff-ffff-ffff-ffff-ffffffffffff"}, + ) + + +@pytest.mark.asyncio +async def test_live_async_linearize_pdf_invalid_file_id( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_pdf_for_linearize: PdfRestFile, +) -> None: + async with AsyncPdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + with pytest.raises(PdfRestApiError, match=r"(?i)(id|file)"): + await client.linearize_pdf( + uploaded_pdf_for_linearize, + extra_body={"id": "00000000-0000-0000-0000-000000000000"}, + ) diff --git a/tests/live/test_live_ocr_pdf.py b/tests/live/test_live_ocr_pdf.py new file mode 100644 index 00000000..5e9ede14 --- /dev/null +++ b/tests/live/test_live_ocr_pdf.py @@ -0,0 +1,87 @@ +from __future__ import annotations + +import pytest + +from pdfrest import AsyncPdfRestClient, PdfRestApiError, PdfRestClient +from pdfrest.models import PdfRestFileBasedResponse + +from ..resources import get_test_resource_path + + +def test_live_ocr_pdf_success( + pdfrest_api_key: str, + pdfrest_live_base_url: str, +) -> None: + resource = get_test_resource_path("report.pdf") + with PdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + uploaded = client.files.create_from_paths([resource])[0] + response = client.ocr_pdf(uploaded, languages=["English", "German"]) + + assert isinstance(response, PdfRestFileBasedResponse) + assert response.output_files + output_file = response.output_file + assert output_file.name.endswith(".pdf") + assert output_file.type == "application/pdf" + assert output_file.size > 0 + assert response.warning is None + assert response.input_id == uploaded.id + + +@pytest.mark.asyncio +async def test_live_async_ocr_pdf_success( + pdfrest_api_key: str, + pdfrest_live_base_url: str, +) -> None: + resource = get_test_resource_path("report.pdf") + async with AsyncPdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + uploaded = (await client.files.create_from_paths([resource]))[0] + response = await client.ocr_pdf(uploaded, output="async-ocr") + + assert isinstance(response, PdfRestFileBasedResponse) + assert response.output_files + assert response.output_file.name.startswith("async-ocr") + assert response.output_file.type == "application/pdf" + assert response.output_file.size > 0 + assert response.warning is None + assert response.input_id == uploaded.id + + +def test_live_ocr_pdf_invalid_pages( + pdfrest_api_key: str, + pdfrest_live_base_url: str, +) -> None: + resource = get_test_resource_path("report.pdf") + with PdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + uploaded = client.files.create_from_paths([resource])[0] + with pytest.raises(PdfRestApiError, match=r"(?i)page"): + client.ocr_pdf( + uploaded, + extra_body={"pages": "last-1"}, + ) + + +@pytest.mark.asyncio +async def test_live_async_ocr_pdf_invalid_pages( + pdfrest_api_key: str, + pdfrest_live_base_url: str, +) -> None: + resource = get_test_resource_path("report.pdf") + async with AsyncPdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + uploaded = (await client.files.create_from_paths([resource]))[0] + with pytest.raises(PdfRestApiError, match=r"(?i)page"): + await client.ocr_pdf( + uploaded, + extra_body={"pages": "last-1"}, + ) diff --git a/tests/live/test_live_pdf_info.py b/tests/live/test_live_pdf_info.py index 977fe87d..7ec91828 100644 --- a/tests/live/test_live_pdf_info.py +++ b/tests/live/test_live_pdf_info.py @@ -111,7 +111,7 @@ def test_live_pdf_info_invalid_query( PdfRestClient( api_key=pdfrest_api_key, base_url=pdfrest_live_base_url ) as client, - pytest.raises(PdfRestApiError), + pytest.raises(PdfRestApiError, match=r"(?i)quer"), ): client.query_pdf_info( uploaded_pdf, diff --git a/tests/live/test_live_pdf_redactions.py b/tests/live/test_live_pdf_redactions.py index 796785a1..3fda6d42 100644 --- a/tests/live/test_live_pdf_redactions.py +++ b/tests/live/test_live_pdf_redactions.py @@ -4,7 +4,7 @@ import pytest -from pdfrest import PdfRestApiError, PdfRestClient +from pdfrest import AsyncPdfRestClient, PdfRestApiError, PdfRestClient from pdfrest.models import PdfRestFile from pdfrest.types import PdfRedactionInstruction, PdfRedactionPreset @@ -135,6 +135,36 @@ def test_live_redaction_preview_and_apply_multiple( assert final_file.type == "application/pdf" +@pytest.mark.asyncio +async def test_live_async_redaction_preview_and_apply( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_pdf_for_redaction: PdfRestFile, +) -> None: + async with AsyncPdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + preview = await client.preview_redactions( + uploaded_pdf_for_redaction, + redactions=[{"type": "literal", "value": "quick brown fox"}], + output="async-redaction-preview", + ) + + preview_file = preview.output_files[0] + applied = await client.apply_redactions( + preview_file, + output="async-redaction-final", + ) + + assert preview.output_files + assert preview_file.name.endswith("async-redaction-preview.pdf") + assert applied.output_files + final_file = applied.output_files[0] + assert final_file.name.endswith("async-redaction-final.pdf") + assert final_file.type == "application/pdf" + + @pytest.mark.parametrize( "extra_body", [ @@ -153,7 +183,13 @@ def test_live_redactions_invalid_payloads( base_url=pdfrest_live_base_url, ) as client: if "redactions" in extra_body: - with pytest.raises(PdfRestApiError): + with pytest.raises( + PdfRestApiError, + match=( + r"The JSON data provided is not properly formatted\. Please check " + r"your syntax and try again\." + ), + ): client.preview_redactions( uploaded_pdf_for_redaction, redactions=[{"type": "literal", "value": "placeholder"}], @@ -165,5 +201,23 @@ def test_live_redactions_invalid_payloads( redactions=[{"type": "literal", "value": "placeholder"}], ) preview_file = preview.output_files[0] - with pytest.raises(PdfRestApiError): + with pytest.raises(PdfRestApiError, match=r"(?i)rgb"): client.apply_redactions(preview_file, extra_body=extra_body) + + +@pytest.mark.asyncio +async def test_live_async_redactions_invalid_payloads( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_pdf_for_redaction: PdfRestFile, +) -> None: + async with AsyncPdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + with pytest.raises(PdfRestApiError, match=r"(?i)rgb"): + await client.preview_redactions( + uploaded_pdf_for_redaction, + redactions=[{"type": "literal", "value": "placeholder"}], + extra_body={"rgb_color": "-1,-1,-1"}, + ) diff --git a/tests/live/test_live_pdf_split_merge.py b/tests/live/test_live_pdf_split_merge.py index 979f7b1e..5a58912c 100644 --- a/tests/live/test_live_pdf_split_merge.py +++ b/tests/live/test_live_pdf_split_merge.py @@ -198,7 +198,7 @@ def test_live_split_pdf_invalid_pages( api_key=pdfrest_api_key, base_url=pdfrest_live_base_url, ) as client, - pytest.raises(PdfRestApiError), + pytest.raises(PdfRestApiError, match=r"(?i)page"), ): client.split_pdf( split_source, @@ -270,7 +270,7 @@ def test_live_merge_pdfs_invalid_pages( api_key=pdfrest_api_key, base_url=pdfrest_live_base_url, ) as client, - pytest.raises(PdfRestApiError), + pytest.raises(PdfRestApiError, match=r"(?i)page"), ): client.merge_pdfs( sources, @@ -373,7 +373,7 @@ def test_live_split_pdf_page_range_variants( output_pages = client.query_pdf_info(response.output_files[0]).page_count assert output_pages == len(expected_pages) else: - with pytest.raises(PdfRestApiError): + with pytest.raises(PdfRestApiError, match=r"(?i)page"): client.split_pdf( split_source, page_groups=[selection if not requires_override else "1"], @@ -446,7 +446,7 @@ def test_live_merge_pdf_page_range_variants( output_info = client.query_pdf_info(response.output_file) assert output_info.page_count == expected_total_pages else: - with pytest.raises(PdfRestApiError): + with pytest.raises(PdfRestApiError, match=r"(?i)page"): client.merge_pdfs( sources, output_prefix=f"live-merge-range-{case_id}", diff --git a/tests/live/test_live_rasterize_pdf.py b/tests/live/test_live_rasterize_pdf.py new file mode 100644 index 00000000..df7cb260 --- /dev/null +++ b/tests/live/test_live_rasterize_pdf.py @@ -0,0 +1,114 @@ +from __future__ import annotations + +import pytest + +from pdfrest import AsyncPdfRestClient, PdfRestApiError, PdfRestClient +from pdfrest.models import PdfRestFile + +from ..resources import get_test_resource_path + + +@pytest.fixture(scope="module") +def uploaded_pdf_for_rasterize( + pdfrest_api_key: str, + pdfrest_live_base_url: str, +) -> PdfRestFile: + resource = get_test_resource_path("report.pdf") + with PdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + return client.files.create_from_paths([resource])[0] + + +@pytest.mark.parametrize( + "output_name", + [ + pytest.param(None, id="default-output"), + pytest.param("rasterized-live", id="custom-output"), + ], +) +def test_live_rasterize_pdf_success( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_pdf_for_rasterize: PdfRestFile, + output_name: str | None, +) -> None: + kwargs: dict[str, str] = {} + if output_name is not None: + kwargs["output"] = output_name + + with PdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + response = client.rasterize_pdf(uploaded_pdf_for_rasterize, **kwargs) + + assert response.output_files + output_file = response.output_file + assert output_file.type == "application/pdf" + assert output_file.size > 0 + assert response.warning is None + assert str(response.input_id) == str(uploaded_pdf_for_rasterize.id) + if output_name is not None: + assert output_file.name.startswith(output_name) + else: + assert output_file.name.endswith(".pdf") + + +@pytest.mark.asyncio +async def test_live_async_rasterize_pdf_success( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_pdf_for_rasterize: PdfRestFile, +) -> None: + async with AsyncPdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + response = await client.rasterize_pdf( + uploaded_pdf_for_rasterize, output="async" + ) + + assert response.output_files + output_file = response.output_file + assert output_file.name.startswith("async") + assert output_file.type == "application/pdf" + assert output_file.size > 0 + assert response.warning is None + assert str(response.input_id) == str(uploaded_pdf_for_rasterize.id) + + +def test_live_rasterize_pdf_invalid_file_id( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_pdf_for_rasterize: PdfRestFile, +) -> None: + with ( + PdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client, + pytest.raises(PdfRestApiError, match=r"(?i)(id|file)"), + ): + client.rasterize_pdf( + uploaded_pdf_for_rasterize, + extra_body={"id": "00000000-0000-0000-0000-000000000000"}, + ) + + +@pytest.mark.asyncio +async def test_live_async_rasterize_pdf_invalid_file_id( + pdfrest_api_key: str, + pdfrest_live_base_url: str, + uploaded_pdf_for_rasterize: PdfRestFile, +) -> None: + async with AsyncPdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + with pytest.raises(PdfRestApiError, match=r"(?i)(id|file)"): + await client.rasterize_pdf( + uploaded_pdf_for_rasterize, + extra_body={"id": "ffffffff-ffff-ffff-ffff-ffffffffffff"}, + ) diff --git a/tests/live/test_live_summarize_pdf_text.py b/tests/live/test_live_summarize_pdf_text.py new file mode 100644 index 00000000..629c815a --- /dev/null +++ b/tests/live/test_live_summarize_pdf_text.py @@ -0,0 +1,112 @@ +from __future__ import annotations + +import pytest + +from pdfrest import AsyncPdfRestClient, PdfRestApiError, PdfRestClient +from pdfrest.models import PdfRestFileBasedResponse, SummarizePdfTextResponse + +from ..resources import get_test_resource_path + + +def test_live_summarize_text_success( + pdfrest_api_key: str, + pdfrest_live_base_url: str, +) -> None: + resource = get_test_resource_path("report.pdf") + with PdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + uploaded = client.files.create_from_paths([resource])[0] + response = client.summarize_text( + uploaded, + target_word_count=40, + summary_format="overview", + ) + + assert isinstance(response, SummarizePdfTextResponse) + assert response.summary + assert response.input_id == uploaded.id + + +def test_live_summarize_text_to_file_success( + pdfrest_api_key: str, + pdfrest_live_base_url: str, +) -> None: + resource = get_test_resource_path("report.pdf") + with PdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + uploaded = client.files.create_from_paths([resource])[0] + response = client.summarize_text_to_file( + uploaded, + target_word_count=40, + summary_format="overview", + ) + + assert isinstance(response, PdfRestFileBasedResponse) + assert response.output_files + output_file = response.output_file + assert output_file.name.endswith(".md") + assert output_file.type == "text/markdown" + assert output_file.size > 0 + assert response.warning is None + assert response.input_id == uploaded.id + + +@pytest.mark.asyncio +async def test_live_async_summarize_text_success( + pdfrest_api_key: str, + pdfrest_live_base_url: str, +) -> None: + resource = get_test_resource_path("report.pdf") + async with AsyncPdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + uploaded = (await client.files.create_from_paths([resource]))[0] + response = await client.summarize_text( + uploaded, + target_word_count=30, + summary_format="overview", + ) + + assert isinstance(response, SummarizePdfTextResponse) + assert response.summary + assert response.input_id == uploaded.id + + +def test_live_summarize_text_invalid_format( + pdfrest_api_key: str, + pdfrest_live_base_url: str, +) -> None: + resource = get_test_resource_path("report.pdf") + with PdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + uploaded = client.files.create_from_paths([resource])[0] + with pytest.raises(PdfRestApiError, match=r"(?i)summary"): + client.summarize_text( + uploaded, + extra_body={"summary_format": "invalid-style"}, + ) + + +@pytest.mark.asyncio +async def test_live_async_summarize_text_invalid_format( + pdfrest_api_key: str, + pdfrest_live_base_url: str, +) -> None: + resource = get_test_resource_path("report.pdf") + async with AsyncPdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + uploaded = (await client.files.create_from_paths([resource]))[0] + with pytest.raises(PdfRestApiError, match=r"(?i)summary"): + await client.summarize_text( + uploaded, + extra_body={"summary_format": "invalid-style"}, + ) diff --git a/tests/live/test_live_translate_pdf_text.py b/tests/live/test_live_translate_pdf_text.py new file mode 100644 index 00000000..00701242 --- /dev/null +++ b/tests/live/test_live_translate_pdf_text.py @@ -0,0 +1,128 @@ +from __future__ import annotations + +import pytest + +from pdfrest import AsyncPdfRestClient, PdfRestApiError, PdfRestClient +from pdfrest.models import ( + TranslatePdfTextFileResponse, + TranslatePdfTextResponse, +) + +from ..resources import get_test_resource_path + + +def test_live_translate_pdf_text_success( + pdfrest_api_key: str, + pdfrest_live_base_url: str, +) -> None: + resource = get_test_resource_path("report.pdf") + with PdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + uploaded = client.files.create_from_paths([resource])[0] + response = client.translate_pdf_text( + uploaded, + output_language="fr", + output_format="plaintext", + ) + + assert isinstance(response, TranslatePdfTextResponse) + assert response.translated_text + assert response.output_language == "fr" + assert response.source_languages + assert response.input_id == uploaded.id + + +@pytest.mark.asyncio +async def test_live_async_translate_pdf_text_success( + pdfrest_api_key: str, + pdfrest_live_base_url: str, +) -> None: + resource = get_test_resource_path("report.pdf") + async with AsyncPdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + uploaded = (await client.files.create_from_paths([resource]))[0] + response = await client.translate_pdf_text( + uploaded, + output_language="es", + output_format="plaintext", + ) + + assert isinstance(response, TranslatePdfTextResponse) + assert response.translated_text + assert response.output_language == "es" + assert response.input_id == uploaded.id + + +def test_live_translate_pdf_text_invalid_output_format( + pdfrest_api_key: str, + pdfrest_live_base_url: str, +) -> None: + resource = get_test_resource_path("report.pdf") + with PdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + uploaded = client.files.create_from_paths([resource])[0] + with pytest.raises( + PdfRestApiError, + match=r"invalid-format is not a valid input for 'output_format'", + ): + client.translate_pdf_text( + uploaded, + output_language="es", + extra_body={"output_format": "invalid-format"}, + ) + + +@pytest.mark.asyncio +async def test_live_async_translate_pdf_text_invalid_output_format( + pdfrest_api_key: str, + pdfrest_live_base_url: str, +) -> None: + resource = get_test_resource_path("report.pdf") + async with AsyncPdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + uploaded = (await client.files.create_from_paths([resource]))[0] + with pytest.raises( + PdfRestApiError, + match=r"invalid-format is not a valid input for 'output_format'", + ): + await client.translate_pdf_text( + uploaded, + output_language="de", + extra_body={"output_format": "invalid-format"}, + ) + + +def test_live_translate_pdf_text_file_success( + pdfrest_api_key: str, + pdfrest_live_base_url: str, +) -> None: + resource = get_test_resource_path("report.pdf") + with PdfRestClient( + api_key=pdfrest_api_key, + base_url=pdfrest_live_base_url, + ) as client: + uploaded = client.files.create_from_paths([resource])[0] + response = client.translate_pdf_text_to_file( + uploaded, + output_language="fr", + output_format="plaintext", + ) + + assert isinstance(response, TranslatePdfTextFileResponse) + assert response.output_files + output_file = response.output_file + assert output_file.name.endswith(".txt") + assert output_file.type == "text/plain" + assert output_file.size > 0 + assert response.warning is None + assert response.output_language == "fr" + assert response.source_languages + assert response.input_id == uploaded.id diff --git a/tests/resources/duckhat.pdf b/tests/resources/duckhat.pdf new file mode 100644 index 00000000..8dbaff23 Binary files /dev/null and b/tests/resources/duckhat.pdf differ diff --git a/tests/test_add_attachment_to_pdf.py b/tests/test_add_attachment_to_pdf.py new file mode 100644 index 00000000..bca6afae --- /dev/null +++ b/tests/test_add_attachment_to_pdf.py @@ -0,0 +1,335 @@ +from __future__ import annotations + +import json +from collections.abc import Callable + +import httpx +import pytest +from pydantic import ValidationError + +from pdfrest import AsyncPdfRestClient, PdfRestClient +from pdfrest.models import PdfRestFile, PdfRestFileBasedResponse, PdfRestFileID +from pdfrest.models._internal import PdfAddAttachmentPayload + +from .graphics_test_helpers import ( + ASYNC_API_KEY, + VALID_API_KEY, + build_file_info_payload, + make_pdf_file, +) + + +def make_attachment_file( + file_id: str, + name: str = "attachment.txt", + mime_type: str = "text/plain", +) -> PdfRestFile: + return PdfRestFile.model_validate(build_file_info_payload(file_id, name, mime_type)) + + +def test_add_attachment_to_pdf_success(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(1)) + attachment = make_attachment_file(str(PdfRestFileID.generate()), "notes.txt") + output_id = str(PdfRestFileID.generate()) + + payload_dump = PdfAddAttachmentPayload.model_validate( + {"files": [input_file], "attachments": [attachment], "output": "attached"} + ).model_dump(mode="json", by_alias=True, exclude_none=True, exclude_unset=True) + + seen: dict[str, int] = {"post": 0, "get": 0} + + def handler(request: httpx.Request) -> httpx.Response: + if ( + request.method == "POST" + and request.url.path == "/pdf-with-added-attachment" + ): + seen["post"] += 1 + payload = json.loads(request.content.decode("utf-8")) + assert payload == payload_dump + return httpx.Response( + 200, + json={ + "inputId": [input_file.id, attachment.id], + "outputId": [output_id], + }, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + seen["get"] += 1 + assert request.url.params["format"] == "info" + return httpx.Response( + 200, + json=build_file_info_payload( + output_id, "attached.pdf", "application/pdf" + ), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + with PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client: + response = client.add_attachment_to_pdf( + input_file, + attachment=attachment, + output="attached", + ) + + assert seen == {"post": 1, "get": 1} + assert isinstance(response, PdfRestFileBasedResponse) + assert response.output_file.name == "attached.pdf" + assert response.output_file.type == "application/pdf" + assert [str(file_id) for file_id in response.input_ids] == [ + str(input_file.id), + str(attachment.id), + ] + assert response.warning is None + + +@pytest.mark.asyncio +async def test_async_add_attachment_to_pdf_success( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(2)) + attachment = make_attachment_file( + str(PdfRestFileID.generate()), + "doc.docx", + "application/vnd.openxmlformats-officedocument.wordprocessingml.document", + ) + output_id = str(PdfRestFileID.generate()) + + def handler(request: httpx.Request) -> httpx.Response: + if ( + request.method == "POST" + and request.url.path == "/pdf-with-added-attachment" + ): + payload = json.loads(request.content.decode("utf-8")) + assert payload["id"] == str(input_file.id) + assert payload["id_to_attach"] == str(attachment.id) + assert payload["output"] == "async-attachment" + return httpx.Response( + 200, + json={ + "inputId": [input_file.id, attachment.id], + "outputId": [output_id], + }, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + assert request.url.params["format"] == "info" + return httpx.Response( + 200, + json=build_file_info_payload( + output_id, + "async-attachment.pdf", + "application/pdf", + ), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + async with AsyncPdfRestClient(api_key=ASYNC_API_KEY, transport=transport) as client: + response = await client.add_attachment_to_pdf( + input_file, + attachment=attachment, + output="async-attachment", + ) + + assert isinstance(response, PdfRestFileBasedResponse) + assert response.output_file.name == "async-attachment.pdf" + assert [str(file_id) for file_id in response.input_ids] == [ + str(input_file.id), + str(attachment.id), + ] + + +def test_add_attachment_to_pdf_request_customization( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate()) + attachment = make_attachment_file( + str(PdfRestFileID.generate()), + "image.png", + "image/png", + ) + output_id = str(PdfRestFileID.generate()) + captured_timeout: dict[str, float | dict[str, float] | None] = {} + + def handler(request: httpx.Request) -> httpx.Response: + if ( + request.method == "POST" + and request.url.path == "/pdf-with-added-attachment" + ): + assert request.url.params["trace"] == "sync" + assert request.headers["X-Debug"] == "sync" + captured_timeout["value"] = request.extensions.get("timeout") + payload = json.loads(request.content.decode("utf-8")) + assert payload["id"] == str(input_file.id) + assert payload["id_to_attach"] == str(attachment.id) + assert payload["output"] == "custom-output" + assert payload["diagnostics"] == "on" + return httpx.Response( + 200, + json={ + "inputId": [input_file.id, attachment.id], + "outputId": [output_id], + }, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + assert request.url.params["format"] == "info" + assert request.url.params["trace"] == "sync" + assert request.headers["X-Debug"] == "sync" + return httpx.Response( + 200, + json=build_file_info_payload( + output_id, + "custom-output.pdf", + "application/pdf", + ), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + with PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client: + response = client.add_attachment_to_pdf( + input_file, + attachment=attachment, + output="custom-output", + extra_query={"trace": "sync"}, + extra_headers={"X-Debug": "sync"}, + extra_body={"diagnostics": "on"}, + timeout=0.5, + ) + + assert response.output_file.name == "custom-output.pdf" + timeout_value = captured_timeout["value"] + assert timeout_value is not None + if isinstance(timeout_value, dict): + assert all(pytest.approx(0.5) == value for value in timeout_value.values()) + else: + assert timeout_value == pytest.approx(0.5) + + +@pytest.mark.asyncio +async def test_async_add_attachment_to_pdf_request_customization( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(1)) + attachment = make_attachment_file( + str(PdfRestFileID.generate()), + "data.json", + "application/json", + ) + output_id = str(PdfRestFileID.generate()) + captured_timeout: dict[str, float | dict[str, float] | None] = {} + + def handler(request: httpx.Request) -> httpx.Response: + if ( + request.method == "POST" + and request.url.path == "/pdf-with-added-attachment" + ): + assert request.url.params["trace"] == "async" + assert request.headers["X-Debug"] == "async" + captured_timeout["value"] = request.extensions.get("timeout") + payload = json.loads(request.content.decode("utf-8")) + assert payload["id"] == str(input_file.id) + assert payload["id_to_attach"] == str(attachment.id) + assert payload["output"] == "async-output" + assert payload["notify"] == "yes" + return httpx.Response( + 200, + json={ + "inputId": [input_file.id, attachment.id], + "outputId": [output_id], + }, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + assert request.url.params["format"] == "info" + assert request.url.params["trace"] == "async" + assert request.headers["X-Debug"] == "async" + return httpx.Response( + 200, + json=build_file_info_payload( + output_id, + "async-output.pdf", + "application/pdf", + ), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + async with AsyncPdfRestClient(api_key=ASYNC_API_KEY, transport=transport) as client: + response = await client.add_attachment_to_pdf( + input_file, + attachment=attachment, + output="async-output", + extra_query={"trace": "async"}, + extra_headers={"X-Debug": "async"}, + extra_body={"notify": "yes"}, + timeout=1.25, + ) + + assert response.output_file.name == "async-output.pdf" + timeout_value = captured_timeout["value"] + assert timeout_value is not None + if isinstance(timeout_value, dict): + assert all(pytest.approx(1.25) == value for value in timeout_value.values()) + else: + assert timeout_value == pytest.approx(1.25) + + +def test_add_attachment_to_pdf_requires_pdf_file( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + not_pdf = make_attachment_file( + str(PdfRestFileID.generate()), + "image.png", + "image/png", + ) + attachment = make_attachment_file(str(PdfRestFileID.generate()), "note.txt") + transport = httpx.MockTransport( + lambda request: (_ for _ in ()).throw(RuntimeError("should not send")) + ) + + with ( + PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client, + pytest.raises(ValidationError, match="Must be a PDF file"), + ): + client.add_attachment_to_pdf(not_pdf, attachment=attachment) + + +@pytest.mark.parametrize( + "payload_data", + [ + pytest.param( + lambda pdf, attachment: { + "files": [pdf, make_pdf_file(PdfRestFileID.generate())], + "attachments": [attachment], + }, + id="multiple-input-files", + ), + pytest.param( + lambda pdf, attachment: { + "files": [pdf], + "attachments": [ + attachment, + make_attachment_file(str(PdfRestFileID.generate())), + ], + }, + id="multiple-attachments", + ), + ], +) +def test_add_attachment_to_pdf_rejects_multiple_files( + payload_data: Callable[[PdfRestFile, PdfRestFile], dict[str, object]], +) -> None: + input_file = make_pdf_file(PdfRestFileID.generate()) + attachment = make_attachment_file(str(PdfRestFileID.generate())) + + with pytest.raises(ValidationError): + PdfAddAttachmentPayload.model_validate(payload_data(input_file, attachment)) diff --git a/tests/test_convert_to_excel.py b/tests/test_convert_to_excel.py new file mode 100644 index 00000000..42346aac --- /dev/null +++ b/tests/test_convert_to_excel.py @@ -0,0 +1,275 @@ +from __future__ import annotations + +import json + +import httpx +import pytest +from pydantic import ValidationError + +from pdfrest import AsyncPdfRestClient, PdfRestClient +from pdfrest.models import PdfRestFile, PdfRestFileBasedResponse, PdfRestFileID +from pdfrest.models._internal import PdfToExcelPayload + +from .graphics_test_helpers import ( + ASYNC_API_KEY, + VALID_API_KEY, + build_file_info_payload, + make_pdf_file, +) + + +def test_convert_to_excel_success(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(1)) + output_id = str(PdfRestFileID.generate()) + + payload_dump = PdfToExcelPayload.model_validate( + {"files": [input_file], "output": "report"} + ).model_dump(mode="json", by_alias=True, exclude_none=True, exclude_unset=True) + + seen: dict[str, int] = {"post": 0, "get": 0} + + def handler(request: httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/excel": + seen["post"] += 1 + payload = json.loads(request.content.decode("utf-8")) + assert payload == payload_dump + return httpx.Response( + 200, + json={ + "inputId": [input_file.id], + "outputId": [output_id], + }, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + seen["get"] += 1 + assert request.url.params["format"] == "info" + return httpx.Response( + 200, + json=build_file_info_payload( + output_id, + "report.xlsx", + "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", + ), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + with PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client: + response = client.convert_to_excel(input_file, output="report") + + assert seen == {"post": 1, "get": 1} + assert isinstance(response, PdfRestFileBasedResponse) + output_file = response.output_file + assert output_file.name == "report.xlsx" + assert ( + output_file.type + == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" + ) + assert response.warning is None + assert str(response.input_id) == str(input_file.id) + + +def test_convert_to_excel_request_customization( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(1)) + output_id = str(PdfRestFileID.generate()) + captured_timeout: dict[str, float | dict[str, float] | None] = {} + + def handler(request: httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/excel": + assert request.url.params["trace"] == "true" + assert request.headers["X-Debug"] == "sync" + captured_timeout["value"] = request.extensions.get("timeout") + payload = json.loads(request.content.decode("utf-8")) + assert payload["debug"] is True + assert payload["id"] == str(input_file.id) + assert payload["output"] == "custom" + return httpx.Response( + 200, + json={ + "inputId": [input_file.id], + "outputId": [output_id], + }, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + assert request.url.params["format"] == "info" + assert request.url.params["trace"] == "true" + assert request.headers["X-Debug"] == "sync" + return httpx.Response( + 200, + json=build_file_info_payload( + output_id, + "custom.xlsx", + "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", + ), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + with PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client: + response = client.convert_to_excel( + input_file, + output="custom", + extra_query={"trace": "true"}, + extra_headers={"X-Debug": "sync"}, + extra_body={"debug": True}, + timeout=0.4, + ) + + assert isinstance(response, PdfRestFileBasedResponse) + assert response.output_file.name == "custom.xlsx" + timeout_value = captured_timeout["value"] + assert timeout_value is not None + if isinstance(timeout_value, dict): + assert all( + component == pytest.approx(0.4) for component in timeout_value.values() + ) + else: + assert timeout_value == pytest.approx(0.4) + + +@pytest.mark.asyncio +async def test_async_convert_to_excel_success( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(2)) + output_id = str(PdfRestFileID.generate()) + + payload_dump = PdfToExcelPayload.model_validate({"files": [input_file]}).model_dump( + mode="json", by_alias=True, exclude_none=True, exclude_unset=True + ) + + seen: dict[str, int] = {"post": 0, "get": 0} + + def handler(request: httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/excel": + seen["post"] += 1 + payload = json.loads(request.content.decode("utf-8")) + assert payload == payload_dump + return httpx.Response( + 200, + json={ + "inputId": [input_file.id], + "outputId": [output_id], + }, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + seen["get"] += 1 + assert request.url.params["format"] == "info" + return httpx.Response( + 200, + json=build_file_info_payload( + output_id, + "async.xlsx", + "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", + ), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + async with AsyncPdfRestClient(api_key=ASYNC_API_KEY, transport=transport) as client: + response = await client.convert_to_excel(input_file) + + assert seen == {"post": 1, "get": 1} + assert isinstance(response, PdfRestFileBasedResponse) + assert response.output_file.name == "async.xlsx" + assert ( + response.output_file.type + == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" + ) + assert str(response.input_id) == str(input_file.id) + + +@pytest.mark.asyncio +async def test_async_convert_to_excel_request_customization( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(2)) + output_id = str(PdfRestFileID.generate()) + captured_timeout: dict[str, float | dict[str, float] | None] = {} + + def handler(request: httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/excel": + assert request.url.params["trace"] == "async" + assert request.headers["X-Debug"] == "async" + captured_timeout["value"] = request.extensions.get("timeout") + payload = json.loads(request.content.decode("utf-8")) + assert payload["debug"] == "yes" + assert payload["id"] == str(input_file.id) + return httpx.Response( + 200, + json={ + "inputId": [input_file.id], + "outputId": [output_id], + }, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + assert request.url.params["format"] == "info" + assert request.url.params["trace"] == "async" + assert request.headers["X-Debug"] == "async" + return httpx.Response( + 200, + json=build_file_info_payload( + output_id, + "async-custom.xlsx", + "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet", + ), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + async with AsyncPdfRestClient(api_key=ASYNC_API_KEY, transport=transport) as client: + response = await client.convert_to_excel( + input_file, + extra_query={"trace": "async"}, + extra_headers={"X-Debug": "async"}, + extra_body={"debug": "yes"}, + timeout=0.55, + ) + + assert isinstance(response, PdfRestFileBasedResponse) + assert response.output_file.name == "async-custom.xlsx" + timeout_value = captured_timeout["value"] + assert timeout_value is not None + if isinstance(timeout_value, dict): + assert all( + component == pytest.approx(0.55) for component in timeout_value.values() + ) + else: + assert timeout_value == pytest.approx(0.55) + + +def test_convert_to_excel_validation(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + pdf_file = make_pdf_file(PdfRestFileID.generate(1)) + png_file = PdfRestFile.model_validate( + build_file_info_payload( + PdfRestFileID.generate(), + "example.png", + "image/png", + ) + ) + transport = httpx.MockTransport(lambda request: (_ for _ in ()).throw(RuntimeError)) + + with ( + PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client, + pytest.raises(ValidationError, match="Must be a PDF file"), + ): + client.convert_to_excel(png_file) + + with ( + PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client, + pytest.raises( + ValidationError, match="List should have at most 1 item after validation" + ), + ): + client.convert_to_excel([pdf_file, make_pdf_file(PdfRestFileID.generate())]) diff --git a/tests/test_convert_to_jpeg.py b/tests/test_convert_to_jpeg.py index 46e5f648..d0f5b047 100644 --- a/tests/test_convert_to_jpeg.py +++ b/tests/test_convert_to_jpeg.py @@ -83,7 +83,7 @@ def handler(request: httpx.Request) -> httpx.Response: assert str(output_file.url).endswith(output_id) -def test_convert_to_jpeg_defaults_excluded(monkeypatch: pytest.MonkeyPatch) -> None: +def test_convert_to_jpeg_defaults_included(monkeypatch: pytest.MonkeyPatch) -> None: monkeypatch.delenv("PDFREST_API_KEY", raising=False) input_file = make_pdf_file(PdfRestFileID.generate(1)) output_id = "8e9f0011-2222-4bcd-9f00-abcdefabcdef" @@ -98,7 +98,9 @@ def handler(request: httpx.Request) -> httpx.Response: assert_conversion_payload( payload, request_payload, allowed_extras={"jpeg_quality"} ) - assert "jpeg_quality" not in payload + assert payload["jpeg_quality"] == 75 + assert payload["resolution"] == 300 + assert payload["color_model"] == "rgb" return httpx.Response( 200, json={"inputId": [input_file.id], "outputId": [output_id]}, @@ -457,7 +459,7 @@ def test_convert_to_jpeg_sequence_arguments(monkeypatch: pytest.MonkeyPatch) -> "page_range": "1, 3", "smoothing": "text", } - ).model_dump(mode="json", by_alias=True, exclude_none=True, exclude_defaults=True) + ).model_dump(mode="json", by_alias=True, exclude_none=True) seen: dict[str, int] = {"post": 0, "get": 0} diff --git a/tests/test_convert_to_markdown.py b/tests/test_convert_to_markdown.py new file mode 100644 index 00000000..22876eb8 --- /dev/null +++ b/tests/test_convert_to_markdown.py @@ -0,0 +1,247 @@ +from __future__ import annotations + +import json + +import httpx +import pytest +from pydantic import ValidationError + +from pdfrest import AsyncPdfRestClient, PdfRestClient +from pdfrest.models import ( + PdfRestFile, + PdfRestFileBasedResponse, + PdfRestFileID, +) +from pdfrest.models._internal import ConvertToMarkdownPayload + +from .graphics_test_helpers import ASYNC_API_KEY, VALID_API_KEY, make_pdf_file + + +def _make_markdown_file(file_id: str, name: str = "markdown.md") -> PdfRestFile: + return PdfRestFile.model_validate( + { + "id": file_id, + "name": name, + "url": f"https://api.pdfrest.com/resource/{file_id}", + "type": "text/markdown", + "size": 64, + "modified": "2024-01-01T00:00:00Z", + "scheduledDeletionTimeUtc": None, + } + ) + + +def test_convert_to_markdown_payload_rejects_non_pdf() -> None: + file_id = str(PdfRestFileID.generate()) + text_file = PdfRestFile.model_validate( + { + "id": file_id, + "name": "notes.txt", + "url": f"https://api.pdfrest.com/resource/{file_id}", + "type": "text/plain", + "size": 64, + "modified": "2024-01-01T00:00:00Z", + "scheduledDeletionTimeUtc": None, + } + ) + with pytest.raises(ValidationError, match="Must be a PDF file"): + ConvertToMarkdownPayload.model_validate({"files": [text_file]}) + + +def test_convert_to_markdown_payload_invalid_page_range() -> None: + file_repr = make_pdf_file(PdfRestFileID.generate(1)) + with pytest.raises( + ValidationError, match="The start page must be less than or equal to the end" + ): + ConvertToMarkdownPayload.model_validate( + {"files": [file_repr], "pages": ["5-2"]} + ) + + +def test_convert_to_markdown_payload_invalid_page_break_comments() -> None: + file_repr = make_pdf_file(PdfRestFileID.generate(1)) + with pytest.raises(ValidationError, match="Input should be 'on' or 'off'"): + ConvertToMarkdownPayload.model_validate( + {"files": [file_repr], "page_break_comments": "maybe"} + ) + + +def test_convert_to_markdown_success(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(1)) + output_id = str(PdfRestFileID.generate()) + payload_dump = ConvertToMarkdownPayload.model_validate( + { + "files": [input_file], + "pages": ["1-3"], + "output": "md", + "output_type": "file", + "page_break_comments": "on", + } + ).model_dump(mode="json", by_alias=True, exclude_none=True, exclude_unset=True) + + seen: dict[str, int] = {"post": 0, "get": 0} + + def handler(request: httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/markdown": + seen["post"] += 1 + payload = json.loads(request.content.decode("utf-8")) + for key, value in payload_dump.items(): + assert payload[key] == value + return httpx.Response( + 200, + json={ + "inputId": [str(input_file.id)], + "outputId": [output_id], + }, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + seen["get"] += 1 + assert request.url.params["format"] == "info" + return httpx.Response( + 200, + json=_make_markdown_file(output_id).model_dump( + mode="json", by_alias=True + ), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + with PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client: + response = client.convert_to_markdown( + input_file, + pages=["1-3"], + output="md", + page_break_comments="on", + ) + + assert seen == {"post": 1, "get": 1} + assert isinstance(response, PdfRestFileBasedResponse) + assert response.input_id == input_file.id + assert len(response.output_files) == 1 + + +def test_convert_to_markdown_request_customization( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(1)) + payload_dump = ConvertToMarkdownPayload.model_validate( + { + "files": [input_file], + "output_type": "file", + "page_break_comments": "off", + } + ).model_dump(mode="json", by_alias=True, exclude_none=True, exclude_unset=True) + output_id = str(PdfRestFileID.generate()) + captured_timeout: dict[str, float | dict[str, float] | None] = {} + + def handler(request: httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/markdown": + assert request.url.params["trace"] == "true" + assert request.headers["X-Debug"] == "sync" + captured_timeout["post"] = request.extensions.get("timeout") + payload = json.loads(request.content.decode("utf-8")) + for key, value in payload_dump.items(): + assert payload[key] == value + assert payload["debug"] is True + return httpx.Response( + 200, + json={ + "inputId": [str(input_file.id)], + "outputId": [output_id], + }, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + assert request.url.params["format"] == "info" + assert request.url.params["trace"] == "true" + assert request.headers["X-Debug"] == "sync" + captured_timeout["get"] = request.extensions.get("timeout") + return httpx.Response( + 200, + json=_make_markdown_file(output_id, "debug.md").model_dump( + mode="json", by_alias=True + ), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + with PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client: + response = client.convert_to_markdown( + input_file, + extra_query={"trace": "true"}, + extra_headers={"X-Debug": "sync"}, + extra_body={"debug": True}, + timeout=0.4, + page_break_comments="off", + ) + + assert isinstance(response, PdfRestFileBasedResponse) + assert len(response.output_files) == 1 + post_timeout = captured_timeout["post"] + get_timeout = captured_timeout["get"] + assert post_timeout is not None + assert get_timeout is not None + if isinstance(post_timeout, dict): + assert all( + component == pytest.approx(0.4) for component in post_timeout.values() + ) + else: + assert post_timeout == pytest.approx(0.4) + if isinstance(get_timeout, dict): + assert all( + component == pytest.approx(0.4) for component in get_timeout.values() + ) + else: + assert get_timeout == pytest.approx(0.4) + + +@pytest.mark.asyncio +async def test_async_convert_to_markdown_success( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(2)) + output_id = str(PdfRestFileID.generate()) + payload_dump = ConvertToMarkdownPayload.model_validate( + {"files": [input_file], "output_type": "file", "page_break_comments": "off"} + ).model_dump(mode="json", by_alias=True, exclude_none=True, exclude_unset=True) + + seen: dict[str, int] = {"post": 0, "get": 0} + + def handler(request: httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/markdown": + seen["post"] += 1 + payload = json.loads(request.content.decode("utf-8")) + for key, value in payload_dump.items(): + assert payload[key] == value + return httpx.Response( + 200, + json={ + "inputId": [str(input_file.id)], + "outputId": [output_id], + }, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + seen["get"] += 1 + assert request.url.params["format"] == "info" + return httpx.Response( + 200, + json=_make_markdown_file(output_id, "async.md").model_dump( + mode="json", by_alias=True + ), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + async with AsyncPdfRestClient(api_key=ASYNC_API_KEY, transport=transport) as client: + response = await client.convert_to_markdown( + input_file, page_break_comments="off" + ) + + assert seen == {"post": 1, "get": 1} + assert isinstance(response, PdfRestFileBasedResponse) + assert len(response.output_files) == 1 diff --git a/tests/test_convert_to_pdfa.py b/tests/test_convert_to_pdfa.py new file mode 100644 index 00000000..dea42a35 --- /dev/null +++ b/tests/test_convert_to_pdfa.py @@ -0,0 +1,318 @@ +from __future__ import annotations + +import json + +import httpx +import pytest +from pydantic import ValidationError + +from pdfrest import AsyncPdfRestClient, PdfRestClient +from pdfrest.models import PdfRestFile, PdfRestFileBasedResponse, PdfRestFileID +from pdfrest.models._internal import PdfToPdfaPayload +from pdfrest.types import PdfAType + +from .graphics_test_helpers import ( + ASYNC_API_KEY, + VALID_API_KEY, + build_file_info_payload, + make_pdf_file, +) + + +@pytest.mark.parametrize( + "output_type", + [ + pytest.param("PDF/A-1b", id="pdfa-1b"), + pytest.param("PDF/A-2b", id="pdfa-2b"), + pytest.param("PDF/A-2u", id="pdfa-2u"), + pytest.param("PDF/A-3b", id="pdfa-3b"), + pytest.param("PDF/A-3u", id="pdfa-3u"), + ], +) +def test_convert_to_pdfa_success( + monkeypatch: pytest.MonkeyPatch, output_type: PdfAType +) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(1)) + output_id = str(PdfRestFileID.generate()) + payload_dump = PdfToPdfaPayload.model_validate( + { + "files": [input_file], + "output_type": output_type, + "output": "archive", + "rasterize_if_errors_encountered": "off", + } + ).model_dump(mode="json", by_alias=True, exclude_none=True) + + seen: dict[str, int] = {"post": 0, "get": 0} + + def handler(request: httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/pdfa": + seen["post"] += 1 + payload = json.loads(request.content.decode("utf-8")) + assert payload == payload_dump + return httpx.Response( + 200, + json={ + "inputId": [input_file.id], + "outputId": [output_id], + }, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + seen["get"] += 1 + assert request.url.params["format"] == "info" + return httpx.Response( + 200, + json=build_file_info_payload( + output_id, "archive.pdf", "application/pdf" + ), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + with PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client: + response = client.convert_to_pdfa( + input_file, + output_type=output_type, + output="archive", + ) + + assert seen == {"post": 1, "get": 1} + assert isinstance(response, PdfRestFileBasedResponse) + assert response.output_file.name == "archive.pdf" + assert response.output_file.type == "application/pdf" + assert str(response.input_id) == str(input_file.id) + assert response.warning is None + + +@pytest.mark.asyncio +@pytest.mark.parametrize( + "output_type", + [ + pytest.param("PDF/A-1b", id="async-pdfa-1b"), + pytest.param("PDF/A-2b", id="async-pdfa-2b"), + pytest.param("PDF/A-2u", id="async-pdfa-2u"), + pytest.param("PDF/A-3b", id="async-pdfa-3b"), + pytest.param("PDF/A-3u", id="async-pdfa-3u"), + ], +) +async def test_async_convert_to_pdfa_success( + monkeypatch: pytest.MonkeyPatch, output_type: PdfAType +) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(2)) + output_id = str(PdfRestFileID.generate()) + payload_dump = PdfToPdfaPayload.model_validate( + { + "files": [input_file], + "output_type": output_type, + "rasterize_if_errors_encountered": "off", + } + ).model_dump(mode="json", by_alias=True, exclude_none=True) + + seen: dict[str, int] = {"post": 0, "get": 0} + + def handler(request: httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/pdfa": + seen["post"] += 1 + payload = json.loads(request.content.decode("utf-8")) + assert payload == payload_dump + return httpx.Response( + 200, + json={ + "inputId": [input_file.id], + "outputId": [output_id], + }, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + seen["get"] += 1 + assert request.url.params["format"] == "info" + return httpx.Response( + 200, + json=build_file_info_payload(output_id, "async.pdf", "application/pdf"), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + async with AsyncPdfRestClient(api_key=ASYNC_API_KEY, transport=transport) as client: + response = await client.convert_to_pdfa( + input_file, + output_type=output_type, + ) + + assert seen == {"post": 1, "get": 1} + assert isinstance(response, PdfRestFileBasedResponse) + assert response.output_file.name == "async.pdf" + assert response.output_file.type == "application/pdf" + assert str(response.input_id) == str(input_file.id) + + +def test_convert_to_pdfa_request_customization( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(1)) + output_id = str(PdfRestFileID.generate()) + captured_timeout: dict[str, float | dict[str, float] | None] = {} + + def handler(request: httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/pdfa": + assert request.url.params["trace"] == "true" + assert request.headers["X-Debug"] == "sync" + captured_timeout["value"] = request.extensions.get("timeout") + payload = json.loads(request.content.decode("utf-8")) + assert payload["output_type"] == "PDF/A-3b" + assert payload["rasterize_if_errors_encountered"] == "on" + assert payload["debug"] == "yes" + assert payload["id"] == str(input_file.id) + assert payload["output"] == "custom" + return httpx.Response( + 200, + json={"inputId": [input_file.id], "outputId": [output_id]}, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + assert request.url.params["format"] == "info" + assert request.url.params["trace"] == "true" + assert request.headers["X-Debug"] == "sync" + return httpx.Response( + 200, + json=build_file_info_payload( + output_id, "custom.pdf", "application/pdf" + ), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + with PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client: + response = client.convert_to_pdfa( + input_file, + output_type="PDF/A-3b", + output="custom", + rasterize_if_errors_encountered="on", + extra_query={"trace": "true"}, + extra_headers={"X-Debug": "sync"}, + extra_body={"debug": "yes"}, + timeout=0.33, + ) + + assert isinstance(response, PdfRestFileBasedResponse) + assert response.output_file.name == "custom.pdf" + timeout_value = captured_timeout["value"] + assert timeout_value is not None + if isinstance(timeout_value, dict): + assert all( + component == pytest.approx(0.33) for component in timeout_value.values() + ) + else: + assert timeout_value == pytest.approx(0.33) + + +@pytest.mark.asyncio +async def test_async_convert_to_pdfa_request_customization( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(2)) + output_id = str(PdfRestFileID.generate()) + captured_timeout: dict[str, float | dict[str, float] | None] = {} + + def handler(request: httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/pdfa": + assert request.url.params["trace"] == "async" + assert request.headers["X-Debug"] == "async" + captured_timeout["value"] = request.extensions.get("timeout") + payload = json.loads(request.content.decode("utf-8")) + assert payload["output_type"] == "PDF/A-2u" + assert payload["id"] == str(input_file.id) + assert payload["extra"] == {"note": "async"} + assert payload["rasterize_if_errors_encountered"] == "off" + return httpx.Response( + 200, + json={"inputId": [input_file.id], "outputId": [output_id]}, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + assert request.url.params["format"] == "info" + assert request.url.params["trace"] == "async" + assert request.headers["X-Debug"] == "async" + return httpx.Response( + 200, + json=build_file_info_payload( + output_id, "async-custom.pdf", "application/pdf" + ), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + async with AsyncPdfRestClient(api_key=ASYNC_API_KEY, transport=transport) as client: + response = await client.convert_to_pdfa( + input_file, + output_type="PDF/A-2u", + rasterize_if_errors_encountered="off", + extra_query={"trace": "async"}, + extra_headers={"X-Debug": "async"}, + extra_body={"extra": {"note": "async"}}, + timeout=0.72, + ) + + assert isinstance(response, PdfRestFileBasedResponse) + assert response.output_file.name == "async-custom.pdf" + timeout_value = captured_timeout["value"] + assert timeout_value is not None + if isinstance(timeout_value, dict): + assert all( + component == pytest.approx(0.72) for component in timeout_value.values() + ) + else: + assert timeout_value == pytest.approx(0.72) + + +def test_convert_to_pdfa_validation(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + pdf_file = make_pdf_file(PdfRestFileID.generate(1)) + png_file = PdfRestFile.model_validate( + build_file_info_payload( + PdfRestFileID.generate(), + "example.png", + "image/png", + ) + ) + transport = httpx.MockTransport(lambda request: (_ for _ in ()).throw(RuntimeError)) + + with ( + PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client, + pytest.raises( + ValidationError, + match=( + "Input should be 'PDF/A-1b', 'PDF/A-2b', 'PDF/A-2u', " + "'PDF/A-3b' or 'PDF/A-3u'" + ), + ), + ): + client.convert_to_pdfa(pdf_file, output_type=None) # type: ignore[arg-type] + + with ( + PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client, + pytest.raises(ValidationError, match="Must be a PDF file"), + ): + client.convert_to_pdfa(png_file, output_type="PDF/A-2b") + + with ( + PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client, + pytest.raises(ValidationError, match="PDF/A-1b"), + ): + client.convert_to_pdfa(pdf_file, output_type="PDF/A-4") # type: ignore[arg-type] + + with ( + PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client, + pytest.raises( + ValidationError, match="List should have at most 1 item after validation" + ), + ): + client.convert_to_pdfa( + [pdf_file, make_pdf_file(PdfRestFileID.generate())], + output_type="PDF/A-2b", + ) diff --git a/tests/test_convert_to_powerpoint.py b/tests/test_convert_to_powerpoint.py new file mode 100644 index 00000000..a8c1daa0 --- /dev/null +++ b/tests/test_convert_to_powerpoint.py @@ -0,0 +1,277 @@ +from __future__ import annotations + +import json + +import httpx +import pytest +from pydantic import ValidationError + +from pdfrest import AsyncPdfRestClient, PdfRestClient +from pdfrest.models import PdfRestFile, PdfRestFileBasedResponse, PdfRestFileID +from pdfrest.models._internal import PdfToPowerpointPayload + +from .graphics_test_helpers import ( + ASYNC_API_KEY, + VALID_API_KEY, + build_file_info_payload, + make_pdf_file, +) + + +def test_convert_to_powerpoint_success(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(1)) + output_id = str(PdfRestFileID.generate()) + + payload_dump = PdfToPowerpointPayload.model_validate( + {"files": [input_file], "output": "slides"} + ).model_dump(mode="json", by_alias=True, exclude_none=True, exclude_unset=True) + + seen: dict[str, int] = {"post": 0, "get": 0} + + def handler(request: httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/powerpoint": + seen["post"] += 1 + payload = json.loads(request.content.decode("utf-8")) + assert payload == payload_dump + return httpx.Response( + 200, + json={ + "inputId": [input_file.id], + "outputId": [output_id], + }, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + seen["get"] += 1 + assert request.url.params["format"] == "info" + return httpx.Response( + 200, + json=build_file_info_payload( + output_id, + "slides.pptx", + "application/vnd.openxmlformats-officedocument.presentationml.presentation", + ), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + with PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client: + response = client.convert_to_powerpoint(input_file, output="slides") + + assert seen == {"post": 1, "get": 1} + assert isinstance(response, PdfRestFileBasedResponse) + output_file = response.output_file + assert output_file.name == "slides.pptx" + assert ( + output_file.type + == "application/vnd.openxmlformats-officedocument.presentationml.presentation" + ) + assert response.warning is None + assert str(response.input_id) == str(input_file.id) + + +def test_convert_to_powerpoint_request_customization( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(1)) + output_id = str(PdfRestFileID.generate()) + captured_timeout: dict[str, float | dict[str, float] | None] = {} + + def handler(request: httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/powerpoint": + assert request.url.params["trace"] == "true" + assert request.headers["X-Debug"] == "sync" + captured_timeout["value"] = request.extensions.get("timeout") + payload = json.loads(request.content.decode("utf-8")) + assert payload["debug"] is True + assert payload["id"] == str(input_file.id) + assert payload["output"] == "custom" + return httpx.Response( + 200, + json={ + "inputId": [input_file.id], + "outputId": [output_id], + }, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + assert request.url.params["format"] == "info" + assert request.url.params["trace"] == "true" + assert request.headers["X-Debug"] == "sync" + return httpx.Response( + 200, + json=build_file_info_payload( + output_id, + "custom.pptx", + "application/vnd.openxmlformats-officedocument.presentationml.presentation", + ), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + with PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client: + response = client.convert_to_powerpoint( + input_file, + output="custom", + extra_query={"trace": "true"}, + extra_headers={"X-Debug": "sync"}, + extra_body={"debug": True}, + timeout=0.4, + ) + + assert isinstance(response, PdfRestFileBasedResponse) + assert response.output_file.name == "custom.pptx" + timeout_value = captured_timeout["value"] + assert timeout_value is not None + if isinstance(timeout_value, dict): + assert all( + component == pytest.approx(0.4) for component in timeout_value.values() + ) + else: + assert timeout_value == pytest.approx(0.4) + + +@pytest.mark.asyncio +async def test_async_convert_to_powerpoint_success( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(2)) + output_id = str(PdfRestFileID.generate()) + + payload_dump = PdfToPowerpointPayload.model_validate( + {"files": [input_file]} + ).model_dump(mode="json", by_alias=True, exclude_none=True, exclude_unset=True) + + seen: dict[str, int] = {"post": 0, "get": 0} + + def handler(request: httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/powerpoint": + seen["post"] += 1 + payload = json.loads(request.content.decode("utf-8")) + assert payload == payload_dump + return httpx.Response( + 200, + json={ + "inputId": [input_file.id], + "outputId": [output_id], + }, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + seen["get"] += 1 + assert request.url.params["format"] == "info" + return httpx.Response( + 200, + json=build_file_info_payload( + output_id, + "async.pptx", + "application/vnd.openxmlformats-officedocument.presentationml.presentation", + ), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + async with AsyncPdfRestClient(api_key=ASYNC_API_KEY, transport=transport) as client: + response = await client.convert_to_powerpoint(input_file) + + assert seen == {"post": 1, "get": 1} + assert isinstance(response, PdfRestFileBasedResponse) + assert response.output_file.name == "async.pptx" + assert ( + response.output_file.type + == "application/vnd.openxmlformats-officedocument.presentationml.presentation" + ) + assert str(response.input_id) == str(input_file.id) + + +@pytest.mark.asyncio +async def test_async_convert_to_powerpoint_request_customization( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(2)) + output_id = str(PdfRestFileID.generate()) + captured_timeout: dict[str, float | dict[str, float] | None] = {} + + def handler(request: httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/powerpoint": + assert request.url.params["trace"] == "async" + assert request.headers["X-Debug"] == "async" + captured_timeout["value"] = request.extensions.get("timeout") + payload = json.loads(request.content.decode("utf-8")) + assert payload["debug"] == "yes" + assert payload["id"] == str(input_file.id) + return httpx.Response( + 200, + json={ + "inputId": [input_file.id], + "outputId": [output_id], + }, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + assert request.url.params["format"] == "info" + assert request.url.params["trace"] == "async" + assert request.headers["X-Debug"] == "async" + return httpx.Response( + 200, + json=build_file_info_payload( + output_id, + "async-custom.pptx", + "application/vnd.openxmlformats-officedocument.presentationml.presentation", + ), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + async with AsyncPdfRestClient(api_key=ASYNC_API_KEY, transport=transport) as client: + response = await client.convert_to_powerpoint( + input_file, + extra_query={"trace": "async"}, + extra_headers={"X-Debug": "async"}, + extra_body={"debug": "yes"}, + timeout=0.55, + ) + + assert isinstance(response, PdfRestFileBasedResponse) + assert response.output_file.name == "async-custom.pptx" + timeout_value = captured_timeout["value"] + assert timeout_value is not None + if isinstance(timeout_value, dict): + assert all( + component == pytest.approx(0.55) for component in timeout_value.values() + ) + else: + assert timeout_value == pytest.approx(0.55) + + +def test_convert_to_powerpoint_validation(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + pdf_file = make_pdf_file(PdfRestFileID.generate(1)) + png_file = PdfRestFile.model_validate( + build_file_info_payload( + PdfRestFileID.generate(), + "example.png", + "image/png", + ) + ) + transport = httpx.MockTransport(lambda request: (_ for _ in ()).throw(RuntimeError)) + + with ( + PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client, + pytest.raises(ValidationError, match="Must be a PDF file"), + ): + client.convert_to_powerpoint(png_file) + + with ( + PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client, + pytest.raises( + ValidationError, match="List should have at most 1 item after validation" + ), + ): + client.convert_to_powerpoint( + [pdf_file, make_pdf_file(PdfRestFileID.generate())] + ) diff --git a/tests/test_convert_xfa_to_acroforms.py b/tests/test_convert_xfa_to_acroforms.py new file mode 100644 index 00000000..6080d22f --- /dev/null +++ b/tests/test_convert_xfa_to_acroforms.py @@ -0,0 +1,271 @@ +from __future__ import annotations + +import json + +import httpx +import pytest +from pydantic import ValidationError + +from pdfrest import AsyncPdfRestClient, PdfRestClient +from pdfrest.models import PdfRestFile, PdfRestFileBasedResponse, PdfRestFileID +from pdfrest.models._internal import PdfXfaToAcroformsPayload + +from .graphics_test_helpers import ( + ASYNC_API_KEY, + VALID_API_KEY, + build_file_info_payload, + make_pdf_file, +) + + +def test_convert_xfa_to_acroforms_success(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(1)) + output_id = str(PdfRestFileID.generate()) + + payload_dump = PdfXfaToAcroformsPayload.model_validate( + {"files": [input_file], "output": "acro"} + ).model_dump(mode="json", by_alias=True, exclude_none=True, exclude_unset=True) + + seen: dict[str, int] = {"post": 0, "get": 0} + + def handler(request: httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/pdf-with-acroforms": + seen["post"] += 1 + payload = json.loads(request.content.decode("utf-8")) + assert payload == payload_dump + return httpx.Response( + 200, + json={ + "inputId": [input_file.id], + "outputId": [output_id], + }, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + seen["get"] += 1 + assert request.url.params["format"] == "info" + return httpx.Response( + 200, + json=build_file_info_payload( + output_id, + "acro.pdf", + "application/pdf", + ), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + with PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client: + response = client.convert_xfa_to_acroforms(input_file, output="acro") + + assert seen == {"post": 1, "get": 1} + assert isinstance(response, PdfRestFileBasedResponse) + output_file = response.output_file + assert output_file.name == "acro.pdf" + assert output_file.type == "application/pdf" + assert response.warning is None + assert str(response.input_id) == str(input_file.id) + + +def test_convert_xfa_to_acroforms_request_customization( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(1)) + output_id = str(PdfRestFileID.generate()) + captured_timeout: dict[str, float | dict[str, float] | None] = {} + + def handler(request: httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/pdf-with-acroforms": + assert request.url.params["trace"] == "true" + assert request.headers["X-Debug"] == "sync" + captured_timeout["value"] = request.extensions.get("timeout") + payload = json.loads(request.content.decode("utf-8")) + assert payload["debug"] == "yes" + assert payload["id"] == str(input_file.id) + assert payload["output"] == "custom" + return httpx.Response( + 200, + json={ + "inputId": [input_file.id], + "outputId": [output_id], + }, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + assert request.url.params["format"] == "info" + assert request.url.params["trace"] == "true" + assert request.headers["X-Debug"] == "sync" + return httpx.Response( + 200, + json=build_file_info_payload( + output_id, + "custom.pdf", + "application/pdf", + ), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + with PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client: + response = client.convert_xfa_to_acroforms( + input_file, + output="custom", + extra_query={"trace": "true"}, + extra_headers={"X-Debug": "sync"}, + extra_body={"debug": "yes"}, + timeout=0.31, + ) + + assert isinstance(response, PdfRestFileBasedResponse) + assert response.output_file.name == "custom.pdf" + timeout_value = captured_timeout["value"] + assert timeout_value is not None + if isinstance(timeout_value, dict): + assert all( + component == pytest.approx(0.31) for component in timeout_value.values() + ) + else: + assert timeout_value == pytest.approx(0.31) + + +@pytest.mark.asyncio +async def test_async_convert_xfa_to_acroforms_success( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(2)) + output_id = str(PdfRestFileID.generate()) + + payload_dump = PdfXfaToAcroformsPayload.model_validate( + {"files": [input_file]} + ).model_dump(mode="json", by_alias=True, exclude_none=True, exclude_unset=True) + + seen: dict[str, int] = {"post": 0, "get": 0} + + def handler(request: httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/pdf-with-acroforms": + seen["post"] += 1 + payload = json.loads(request.content.decode("utf-8")) + assert payload == payload_dump + return httpx.Response( + 200, + json={ + "inputId": [input_file.id], + "outputId": [output_id], + }, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + seen["get"] += 1 + assert request.url.params["format"] == "info" + return httpx.Response( + 200, + json=build_file_info_payload( + output_id, + "async.pdf", + "application/pdf", + ), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + async with AsyncPdfRestClient(api_key=ASYNC_API_KEY, transport=transport) as client: + response = await client.convert_xfa_to_acroforms(input_file) + + assert seen == {"post": 1, "get": 1} + assert isinstance(response, PdfRestFileBasedResponse) + assert response.output_file.name == "async.pdf" + assert response.output_file.type == "application/pdf" + assert str(response.input_id) == str(input_file.id) + + +@pytest.mark.asyncio +async def test_async_convert_xfa_to_acroforms_request_customization( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(2)) + output_id = str(PdfRestFileID.generate()) + captured_timeout: dict[str, float | dict[str, float] | None] = {} + + def handler(request: httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/pdf-with-acroforms": + assert request.url.params["trace"] == "async" + assert request.headers["X-Debug"] == "async" + captured_timeout["value"] = request.extensions.get("timeout") + payload = json.loads(request.content.decode("utf-8")) + assert payload["debug"] == "yes" + assert payload["id"] == str(input_file.id) + return httpx.Response( + 200, + json={ + "inputId": [input_file.id], + "outputId": [output_id], + }, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + assert request.url.params["format"] == "info" + assert request.url.params["trace"] == "async" + assert request.headers["X-Debug"] == "async" + return httpx.Response( + 200, + json=build_file_info_payload( + output_id, + "async-custom.pdf", + "application/pdf", + ), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + async with AsyncPdfRestClient(api_key=ASYNC_API_KEY, transport=transport) as client: + response = await client.convert_xfa_to_acroforms( + input_file, + extra_query={"trace": "async"}, + extra_headers={"X-Debug": "async"}, + extra_body={"debug": "yes"}, + timeout=0.52, + ) + + assert isinstance(response, PdfRestFileBasedResponse) + assert response.output_file.name == "async-custom.pdf" + timeout_value = captured_timeout["value"] + assert timeout_value is not None + if isinstance(timeout_value, dict): + assert all( + component == pytest.approx(0.52) for component in timeout_value.values() + ) + else: + assert timeout_value == pytest.approx(0.52) + + +def test_convert_xfa_to_acroforms_validation(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + pdf_file = make_pdf_file(PdfRestFileID.generate(1)) + png_file = PdfRestFile.model_validate( + build_file_info_payload( + PdfRestFileID.generate(), + "example.png", + "image/png", + ) + ) + transport = httpx.MockTransport(lambda request: (_ for _ in ()).throw(RuntimeError)) + + with ( + PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client, + pytest.raises(ValidationError, match="Must be a PDF file"), + ): + client.convert_xfa_to_acroforms(png_file) + + with ( + PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client, + pytest.raises( + ValidationError, match="List should have at most 1 item after validation" + ), + ): + client.convert_xfa_to_acroforms( + [pdf_file, make_pdf_file(PdfRestFileID.generate())] + ) diff --git a/tests/test_extract_images.py b/tests/test_extract_images.py new file mode 100644 index 00000000..5dea441b --- /dev/null +++ b/tests/test_extract_images.py @@ -0,0 +1,211 @@ +from __future__ import annotations + +import json + +import httpx +import pytest +from pydantic import ValidationError + +from pdfrest import AsyncPdfRestClient, PdfRestClient +from pdfrest.models import PdfRestFile, PdfRestFileBasedResponse, PdfRestFileID +from pdfrest.models._internal import ExtractImagesPayload + +from .graphics_test_helpers import ASYNC_API_KEY, VALID_API_KEY, make_pdf_file + + +def _make_png_file(file_id: str, name: str) -> PdfRestFile: + return PdfRestFile.model_validate( + { + "id": file_id, + "name": name, + "url": f"https://api.pdfrest.com/resource/{file_id}", + "type": "image/png", + "size": 10, + "modified": "2024-01-01T00:00:00Z", + "scheduledDeletionTimeUtc": None, + } + ) + + +def test_extract_images_payload_rejects_non_pdf() -> None: + file_id = str(PdfRestFileID.generate()) + text_file = PdfRestFile.model_validate( + { + "id": file_id, + "name": "notes.txt", + "url": f"https://api.pdfrest.com/resource/{file_id}", + "type": "text/plain", + "size": 64, + "modified": "2024-01-01T00:00:00Z", + "scheduledDeletionTimeUtc": None, + } + ) + with pytest.raises(ValidationError, match="Must be a PDF file"): + ExtractImagesPayload.model_validate({"files": [text_file]}) + + +def test_extract_images_payload_invalid_page_range() -> None: + file_repr = make_pdf_file(PdfRestFileID.generate(1)) + with pytest.raises( + ValidationError, match="The start page must be less than or equal to the end" + ): + ExtractImagesPayload.model_validate({"files": [file_repr], "pages": ["5-2"]}) + + +def test_extract_images_success(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(1)) + output_id_1 = str(PdfRestFileID.generate()) + output_id_2 = str(PdfRestFileID.generate()) + + payload_dump = ExtractImagesPayload.model_validate( + {"files": [input_file], "pages": ["1-3"], "output": "images"} + ).model_dump(mode="json", by_alias=True, exclude_none=True, exclude_unset=True) + + seen: dict[str, int] = {"post": 0, "get": 0} + + def handler(request: httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/extracted-images": + seen["post"] += 1 + payload = json.loads(request.content.decode("utf-8")) + assert payload == payload_dump + return httpx.Response( + 200, + json={ + "inputId": [str(input_file.id)], + "outputId": [output_id_1, output_id_2], + }, + ) + if request.method == "GET" and request.url.path in { + f"/resource/{output_id_1}", + f"/resource/{output_id_2}", + }: + seen["get"] += 1 + return httpx.Response( + 200, + json=_make_png_file( + output_id_1 + if request.url.path.endswith(output_id_1) + else output_id_2, + "image.png", + ).model_dump(mode="json", by_alias=True), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + with PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client: + response = client.extract_images(input_file, pages=["1-3"], output="images") + + assert seen == {"post": 1, "get": 2} + assert isinstance(response, PdfRestFileBasedResponse) + assert len(response.output_files) == 2 + assert response.input_id == input_file.id + + +def test_extract_images_request_customization( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(1)) + output_id = str(PdfRestFileID.generate()) + payload_dump = ExtractImagesPayload.model_validate( + {"files": [input_file], "pages": ["1-last"]} + ).model_dump(mode="json", by_alias=True, exclude_none=True, exclude_unset=True) + captured_timeout: dict[str, float | dict[str, float] | None] = {} + + def handler(request: httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/extracted-images": + assert request.url.params["trace"] == "true" + assert request.headers["X-Debug"] == "sync" + captured_timeout["value"] = request.extensions.get("timeout") + payload = json.loads(request.content.decode("utf-8")) + assert payload == payload_dump | {"debug": True} + return httpx.Response( + 200, + json={ + "inputId": str(input_file.id), + "outputId": [output_id], + }, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + assert request.url.params["format"] == "info" + assert request.url.params["trace"] == "true" + assert request.headers["X-Debug"] == "sync" + return httpx.Response( + 200, + json=_make_png_file(output_id, "debug.png").model_dump( + mode="json", by_alias=True + ), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + with PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client: + response = client.extract_images( + input_file, + pages=["1-last"], + extra_query={"trace": "true"}, + extra_headers={"X-Debug": "sync"}, + extra_body={"debug": True}, + timeout=0.3, + ) + + assert isinstance(response, PdfRestFileBasedResponse) + assert len(response.output_files) == 1 + timeout_value = captured_timeout["value"] + assert timeout_value is not None + if isinstance(timeout_value, dict): + assert all( + component == pytest.approx(0.3) for component in timeout_value.values() + ) + else: + assert timeout_value == pytest.approx(0.3) + + +@pytest.mark.asyncio +async def test_async_extract_images_success( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(2)) + output_id = str(PdfRestFileID.generate()) + + payload_dump = ExtractImagesPayload.model_validate( + {"files": [input_file]} + ).model_dump(mode="json", by_alias=True, exclude_none=True, exclude_unset=True) + + seen: dict[str, int] = {"post": 0, "get": 0} + + def handler(request: httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/extracted-images": + seen["post"] += 1 + payload = json.loads(request.content.decode("utf-8")) + assert payload == payload_dump + return httpx.Response( + 200, + json={ + "inputId": [str(input_file.id)], + "outputId": [output_id], + }, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + seen["get"] += 1 + return httpx.Response( + 200, + json=_make_png_file(output_id, "async.png").model_dump( + mode="json", by_alias=True + ), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + async with AsyncPdfRestClient(api_key=ASYNC_API_KEY, transport=transport) as client: + response = await client.extract_images(input_file) + + assert seen == {"post": 1, "get": 1} + assert isinstance(response, PdfRestFileBasedResponse) + assert len(response.output_files) == 1 + assert response.input_id == input_file.id diff --git a/tests/test_extract_pdf_text_to_file.py b/tests/test_extract_pdf_text_to_file.py new file mode 100644 index 00000000..a2ad457c --- /dev/null +++ b/tests/test_extract_pdf_text_to_file.py @@ -0,0 +1,240 @@ +from __future__ import annotations + +import json + +import httpx +import pytest +from pydantic import ValidationError + +from pdfrest import AsyncPdfRestClient, PdfRestClient +from pdfrest.models import PdfRestFile, PdfRestFileBasedResponse, PdfRestFileID +from pdfrest.models._internal import ExtractTextPayload + +from .graphics_test_helpers import ASYNC_API_KEY, VALID_API_KEY, make_pdf_file + + +def _make_text_file(file_id: str, name: str = "extracted.txt") -> PdfRestFile: + return PdfRestFile.model_validate( + { + "id": file_id, + "name": name, + "url": f"https://api.pdfrest.com/resource/{file_id}", + "type": "text/plain", + "size": 64, + "modified": "2024-01-01T00:00:00Z", + "scheduledDeletionTimeUtc": None, + } + ) + + +def test_extract_pdf_text_payload_rejects_non_pdf() -> None: + file_id = str(PdfRestFileID.generate()) + text_file = PdfRestFile.model_validate( + { + "id": file_id, + "name": "notes.txt", + "url": f"https://api.pdfrest.com/resource/{file_id}", + "type": "text/plain", + "size": 64, + "modified": "2024-01-01T00:00:00Z", + "scheduledDeletionTimeUtc": None, + } + ) + with pytest.raises(ValidationError, match="Must be a PDF file"): + ExtractTextPayload.model_validate({"files": [text_file]}) + + +def test_extract_pdf_text_payload_invalid_page_range() -> None: + file_repr = make_pdf_file(PdfRestFileID.generate(1)) + with pytest.raises( + ValidationError, match="The start page must be less than or equal to the end" + ): + ExtractTextPayload.model_validate({"files": [file_repr], "pages": ["5-2"]}) + + +def test_extract_pdf_text_to_file_success(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(1)) + output_id = str(PdfRestFileID.generate()) + payload_dump = ExtractTextPayload.model_validate( + { + "files": [input_file], + "pages": ["1-3"], + "output": "text", + "full_text": "document", + "preserve_line_breaks": "off", + "word_style": "off", + "word_coordinates": "off", + "output_type": "file", + } + ).model_dump(mode="json", by_alias=True, exclude_none=True, exclude_unset=True) + + seen: dict[str, int] = {"post": 0, "get": 0} + + def handler(request: httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/extracted-text": + seen["post"] += 1 + payload = json.loads(request.content.decode("utf-8")) + assert payload == payload_dump + return httpx.Response( + 200, + json={ + "inputId": [str(input_file.id)], + "outputId": [output_id], + }, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + seen["get"] += 1 + assert request.url.params["format"] == "info" + return httpx.Response( + 200, + json=_make_text_file(output_id).model_dump(mode="json", by_alias=True), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + with PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client: + response = client.extract_pdf_text_to_file( + input_file, + pages=["1-3"], + output="text", + ) + + assert seen == {"post": 1, "get": 1} + assert isinstance(response, PdfRestFileBasedResponse) + assert response.input_id == input_file.id + assert len(response.output_files) == 1 + assert response.output_file.id == output_id + + +def test_extract_pdf_text_request_customization( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(1)) + output_id = str(PdfRestFileID.generate()) + payload_dump = ExtractTextPayload.model_validate( + { + "files": [input_file], + "output": "file-output", + "full_text": "document", + "preserve_line_breaks": "off", + "word_style": "off", + "word_coordinates": "off", + "output_type": "file", + } + ).model_dump(mode="json", by_alias=True, exclude_none=True, exclude_unset=True) + captured_timeout: dict[str, float | dict[str, float] | None] = {} + + def handler(request: httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/extracted-text": + assert request.url.params["trace"] == "true" + assert request.headers["X-Debug"] == "sync" + captured_timeout["post"] = request.extensions.get("timeout") + payload = json.loads(request.content.decode("utf-8")) + assert payload == payload_dump | {"debug": True} + return httpx.Response( + 200, + json={ + "inputId": [str(input_file.id)], + "outputId": [output_id], + }, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + assert request.url.params["format"] == "info" + assert request.url.params["trace"] == "true" + assert request.headers["X-Debug"] == "sync" + captured_timeout["get"] = request.extensions.get("timeout") + return httpx.Response( + 200, + json=_make_text_file(output_id, "debug.txt").model_dump( + mode="json", by_alias=True + ), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + with PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client: + response = client.extract_pdf_text_to_file( + input_file, + output="file-output", + extra_query={"trace": "true"}, + extra_headers={"X-Debug": "sync"}, + extra_body={"debug": True}, + timeout=0.35, + ) + + assert isinstance(response, PdfRestFileBasedResponse) + assert len(response.output_files) == 1 + post_timeout = captured_timeout["post"] + get_timeout = captured_timeout["get"] + assert post_timeout is not None + assert get_timeout is not None + if isinstance(post_timeout, dict): + assert all( + component == pytest.approx(0.35) for component in post_timeout.values() + ) + else: + assert post_timeout == pytest.approx(0.35) + if isinstance(get_timeout, dict): + assert all( + component == pytest.approx(0.35) for component in get_timeout.values() + ) + else: + assert get_timeout == pytest.approx(0.35) + + +@pytest.mark.asyncio +async def test_async_extract_pdf_text_to_file_success( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(2)) + output_id = str(PdfRestFileID.generate()) + payload_dump = ExtractTextPayload.model_validate( + { + "files": [input_file], + "full_text": "document", + "preserve_line_breaks": "off", + "word_style": "off", + "word_coordinates": "off", + "output_type": "file", + } + ).model_dump(mode="json", by_alias=True, exclude_none=True, exclude_unset=True) + + seen: dict[str, int] = {"post": 0, "get": 0} + + def handler(request: httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/extracted-text": + seen["post"] += 1 + payload = json.loads(request.content.decode("utf-8")) + assert payload == payload_dump + return httpx.Response( + 200, + json={ + "inputId": [str(input_file.id)], + "outputId": [output_id], + }, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + seen["get"] += 1 + assert request.url.params["format"] == "info" + return httpx.Response( + 200, + json=_make_text_file(output_id, "async.txt").model_dump( + mode="json", by_alias=True + ), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + async with AsyncPdfRestClient(api_key=ASYNC_API_KEY, transport=transport) as client: + response = await client.extract_pdf_text_to_file(input_file) + + assert seen == {"post": 1, "get": 1} + assert isinstance(response, PdfRestFileBasedResponse) + assert len(response.output_files) == 1 + assert response.input_id == input_file.id diff --git a/tests/test_flatten_annotations.py b/tests/test_flatten_annotations.py new file mode 100644 index 00000000..d5407a3d --- /dev/null +++ b/tests/test_flatten_annotations.py @@ -0,0 +1,281 @@ +from __future__ import annotations + +import json + +import httpx +import pytest +from pydantic import ValidationError + +from pdfrest import AsyncPdfRestClient, PdfRestClient +from pdfrest.models import PdfRestFile, PdfRestFileBasedResponse, PdfRestFileID +from pdfrest.models._internal import PdfFlattenAnnotationsPayload + +from .graphics_test_helpers import ( + ASYNC_API_KEY, + VALID_API_KEY, + build_file_info_payload, + make_pdf_file, +) + + +def test_flatten_annotations_success(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(1)) + output_id = str(PdfRestFileID.generate()) + + payload_dump = PdfFlattenAnnotationsPayload.model_validate( + {"files": [input_file], "output": "flattened"} + ).model_dump(mode="json", by_alias=True, exclude_none=True, exclude_unset=True) + + seen: dict[str, int] = {"post": 0, "get": 0} + + def handler(request: httpx.Request) -> httpx.Response: + if ( + request.method == "POST" + and request.url.path == "/flattened-annotations-pdf" + ): + seen["post"] += 1 + payload = json.loads(request.content.decode("utf-8")) + assert payload == payload_dump + return httpx.Response( + 200, + json={ + "inputId": [input_file.id], + "outputId": [output_id], + }, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + seen["get"] += 1 + assert request.url.params["format"] == "info" + return httpx.Response( + 200, + json=build_file_info_payload( + output_id, + "flattened.pdf", + "application/pdf", + ), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + with PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client: + response = client.flatten_annotations(input_file, output="flattened") + + assert seen == {"post": 1, "get": 1} + assert isinstance(response, PdfRestFileBasedResponse) + output_file = response.output_file + assert output_file.name == "flattened.pdf" + assert output_file.type == "application/pdf" + assert response.warning is None + assert str(response.input_id) == str(input_file.id) + + +def test_flatten_annotations_request_customization( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(1)) + output_id = str(PdfRestFileID.generate()) + captured_timeout: dict[str, float | dict[str, float] | None] = {} + + def handler(request: httpx.Request) -> httpx.Response: + if ( + request.method == "POST" + and request.url.path == "/flattened-annotations-pdf" + ): + assert request.url.params["trace"] == "true" + assert request.headers["X-Debug"] == "sync" + captured_timeout["value"] = request.extensions.get("timeout") + payload = json.loads(request.content.decode("utf-8")) + assert payload["debug"] == "yes" + assert payload["id"] == str(input_file.id) + assert payload["output"] == "custom" + return httpx.Response( + 200, + json={ + "inputId": [input_file.id], + "outputId": [output_id], + }, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + assert request.url.params["format"] == "info" + assert request.url.params["trace"] == "true" + assert request.headers["X-Debug"] == "sync" + return httpx.Response( + 200, + json=build_file_info_payload( + output_id, + "custom.pdf", + "application/pdf", + ), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + with PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client: + response = client.flatten_annotations( + input_file, + output="custom", + extra_query={"trace": "true"}, + extra_headers={"X-Debug": "sync"}, + extra_body={"debug": "yes"}, + timeout=0.29, + ) + + assert isinstance(response, PdfRestFileBasedResponse) + assert response.output_file.name == "custom.pdf" + timeout_value = captured_timeout["value"] + assert timeout_value is not None + if isinstance(timeout_value, dict): + assert all( + component == pytest.approx(0.29) for component in timeout_value.values() + ) + else: + assert timeout_value == pytest.approx(0.29) + + +@pytest.mark.asyncio +async def test_async_flatten_annotations_success( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(2)) + output_id = str(PdfRestFileID.generate()) + + payload_dump = PdfFlattenAnnotationsPayload.model_validate( + {"files": [input_file]} + ).model_dump(mode="json", by_alias=True, exclude_none=True, exclude_unset=True) + + seen: dict[str, int] = {"post": 0, "get": 0} + + def handler(request: httpx.Request) -> httpx.Response: + if ( + request.method == "POST" + and request.url.path == "/flattened-annotations-pdf" + ): + seen["post"] += 1 + payload = json.loads(request.content.decode("utf-8")) + assert payload == payload_dump + return httpx.Response( + 200, + json={ + "inputId": [input_file.id], + "outputId": [output_id], + }, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + seen["get"] += 1 + assert request.url.params["format"] == "info" + return httpx.Response( + 200, + json=build_file_info_payload( + output_id, + "async.pdf", + "application/pdf", + ), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + async with AsyncPdfRestClient(api_key=ASYNC_API_KEY, transport=transport) as client: + response = await client.flatten_annotations(input_file) + + assert seen == {"post": 1, "get": 1} + assert isinstance(response, PdfRestFileBasedResponse) + assert response.output_file.name == "async.pdf" + assert response.output_file.type == "application/pdf" + assert str(response.input_id) == str(input_file.id) + + +@pytest.mark.asyncio +async def test_async_flatten_annotations_request_customization( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(2)) + output_id = str(PdfRestFileID.generate()) + captured_timeout: dict[str, float | dict[str, float] | None] = {} + + def handler(request: httpx.Request) -> httpx.Response: + if ( + request.method == "POST" + and request.url.path == "/flattened-annotations-pdf" + ): + assert request.url.params["trace"] == "async" + assert request.headers["X-Debug"] == "async" + captured_timeout["value"] = request.extensions.get("timeout") + payload = json.loads(request.content.decode("utf-8")) + assert payload["debug"] == "yes" + assert payload["id"] == str(input_file.id) + return httpx.Response( + 200, + json={ + "inputId": [input_file.id], + "outputId": [output_id], + }, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + assert request.url.params["format"] == "info" + assert request.url.params["trace"] == "async" + assert request.headers["X-Debug"] == "async" + return httpx.Response( + 200, + json=build_file_info_payload( + output_id, + "async-custom.pdf", + "application/pdf", + ), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + async with AsyncPdfRestClient(api_key=ASYNC_API_KEY, transport=transport) as client: + response = await client.flatten_annotations( + input_file, + extra_query={"trace": "async"}, + extra_headers={"X-Debug": "async"}, + extra_body={"debug": "yes"}, + timeout=0.52, + ) + + assert isinstance(response, PdfRestFileBasedResponse) + assert response.output_file.name == "async-custom.pdf" + timeout_value = captured_timeout["value"] + assert timeout_value is not None + if isinstance(timeout_value, dict): + assert all( + component == pytest.approx(0.52) for component in timeout_value.values() + ) + else: + assert timeout_value == pytest.approx(0.52) + + +def test_flatten_annotations_validation(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + pdf_file = make_pdf_file(PdfRestFileID.generate(1)) + png_file = PdfRestFile.model_validate( + build_file_info_payload( + PdfRestFileID.generate(), + "example.png", + "image/png", + ) + ) + transport = httpx.MockTransport(lambda request: (_ for _ in ()).throw(RuntimeError)) + + with ( + PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client, + pytest.raises(ValidationError, match="Must be a PDF file"), + ): + client.flatten_annotations(png_file) + + with ( + PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client, + pytest.raises( + ValidationError, match="List should have at most 1 item after validation" + ), + ): + client.flatten_annotations([pdf_file, make_pdf_file(PdfRestFileID.generate())]) diff --git a/tests/test_flatten_transparencies.py b/tests/test_flatten_transparencies.py new file mode 100644 index 00000000..0035fd70 --- /dev/null +++ b/tests/test_flatten_transparencies.py @@ -0,0 +1,297 @@ +from __future__ import annotations + +import json + +import httpx +import pytest +from pydantic import ValidationError + +from pdfrest import AsyncPdfRestClient, PdfRestClient +from pdfrest.models import PdfRestFile, PdfRestFileBasedResponse, PdfRestFileID +from pdfrest.models._internal import PdfFlattenTransparenciesPayload + +from .graphics_test_helpers import ( + ASYNC_API_KEY, + VALID_API_KEY, + build_file_info_payload, + make_pdf_file, +) + + +def test_flatten_transparencies_success(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(1)) + output_id = str(PdfRestFileID.generate()) + + payload_dump = PdfFlattenTransparenciesPayload.model_validate( + {"files": [input_file], "output": "flattened", "quality": "high"} + ).model_dump(mode="json", by_alias=True, exclude_none=True, exclude_unset=True) + + seen: dict[str, int] = {"post": 0, "get": 0} + + def handler(request: httpx.Request) -> httpx.Response: + if ( + request.method == "POST" + and request.url.path == "/flattened-transparencies-pdf" + ): + seen["post"] += 1 + payload = json.loads(request.content.decode("utf-8")) + assert payload == payload_dump + return httpx.Response( + 200, + json={ + "inputId": [input_file.id], + "outputId": [output_id], + }, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + seen["get"] += 1 + assert request.url.params["format"] == "info" + return httpx.Response( + 200, + json=build_file_info_payload( + output_id, + "flattened.pdf", + "application/pdf", + ), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + with PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client: + response = client.flatten_transparencies( + input_file, output="flattened", quality="high" + ) + + assert seen == {"post": 1, "get": 1} + assert isinstance(response, PdfRestFileBasedResponse) + output_file = response.output_file + assert output_file.name == "flattened.pdf" + assert output_file.type == "application/pdf" + assert response.warning is None + assert str(response.input_id) == str(input_file.id) + + +def test_flatten_transparencies_request_customization( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(1)) + output_id = str(PdfRestFileID.generate()) + captured_timeout: dict[str, float | dict[str, float] | None] = {} + + def handler(request: httpx.Request) -> httpx.Response: + if ( + request.method == "POST" + and request.url.path == "/flattened-transparencies-pdf" + ): + assert request.url.params["trace"] == "true" + assert request.headers["X-Debug"] == "sync" + captured_timeout["value"] = request.extensions.get("timeout") + payload = json.loads(request.content.decode("utf-8")) + assert payload["debug"] == "yes" + assert payload["id"] == str(input_file.id) + assert payload["output"] == "custom" + assert payload["quality"] == "low" + return httpx.Response( + 200, + json={ + "inputId": [input_file.id], + "outputId": [output_id], + }, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + assert request.url.params["format"] == "info" + assert request.url.params["trace"] == "true" + assert request.headers["X-Debug"] == "sync" + return httpx.Response( + 200, + json=build_file_info_payload( + output_id, + "custom.pdf", + "application/pdf", + ), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + with PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client: + response = client.flatten_transparencies( + input_file, + output="custom", + quality="low", + extra_query={"trace": "true"}, + extra_headers={"X-Debug": "sync"}, + extra_body={"debug": "yes"}, + timeout=0.29, + ) + + assert isinstance(response, PdfRestFileBasedResponse) + assert response.output_file.name == "custom.pdf" + timeout_value = captured_timeout["value"] + assert timeout_value is not None + if isinstance(timeout_value, dict): + assert all( + component == pytest.approx(0.29) for component in timeout_value.values() + ) + else: + assert timeout_value == pytest.approx(0.29) + + +@pytest.mark.asyncio +async def test_async_flatten_transparencies_success( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(2)) + output_id = str(PdfRestFileID.generate()) + + payload_dump = PdfFlattenTransparenciesPayload.model_validate( + {"files": [input_file], "quality": "medium"} + ).model_dump(mode="json", by_alias=True, exclude_none=True, exclude_unset=True) + + seen: dict[str, int] = {"post": 0, "get": 0} + + def handler(request: httpx.Request) -> httpx.Response: + if ( + request.method == "POST" + and request.url.path == "/flattened-transparencies-pdf" + ): + seen["post"] += 1 + payload = json.loads(request.content.decode("utf-8")) + assert payload == payload_dump + return httpx.Response( + 200, + json={ + "inputId": [input_file.id], + "outputId": [output_id], + }, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + seen["get"] += 1 + assert request.url.params["format"] == "info" + return httpx.Response( + 200, + json=build_file_info_payload( + output_id, + "async.pdf", + "application/pdf", + ), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + async with AsyncPdfRestClient(api_key=ASYNC_API_KEY, transport=transport) as client: + response = await client.flatten_transparencies(input_file) + + assert seen == {"post": 1, "get": 1} + assert isinstance(response, PdfRestFileBasedResponse) + assert response.output_file.name == "async.pdf" + assert response.output_file.type == "application/pdf" + assert str(response.input_id) == str(input_file.id) + + +@pytest.mark.asyncio +async def test_async_flatten_transparencies_request_customization( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(2)) + output_id = str(PdfRestFileID.generate()) + captured_timeout: dict[str, float | dict[str, float] | None] = {} + + def handler(request: httpx.Request) -> httpx.Response: + if ( + request.method == "POST" + and request.url.path == "/flattened-transparencies-pdf" + ): + assert request.url.params["trace"] == "async" + assert request.headers["X-Debug"] == "async" + captured_timeout["value"] = request.extensions.get("timeout") + payload = json.loads(request.content.decode("utf-8")) + assert payload["debug"] == "yes" + assert payload["id"] == str(input_file.id) + assert payload["quality"] == "high" + return httpx.Response( + 200, + json={ + "inputId": [input_file.id], + "outputId": [output_id], + }, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + assert request.url.params["format"] == "info" + assert request.url.params["trace"] == "async" + assert request.headers["X-Debug"] == "async" + return httpx.Response( + 200, + json=build_file_info_payload( + output_id, + "async-custom.pdf", + "application/pdf", + ), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + async with AsyncPdfRestClient(api_key=ASYNC_API_KEY, transport=transport) as client: + response = await client.flatten_transparencies( + input_file, + quality="high", + extra_query={"trace": "async"}, + extra_headers={"X-Debug": "async"}, + extra_body={"debug": "yes"}, + timeout=0.52, + ) + + assert isinstance(response, PdfRestFileBasedResponse) + assert response.output_file.name == "async-custom.pdf" + timeout_value = captured_timeout["value"] + assert timeout_value is not None + if isinstance(timeout_value, dict): + assert all( + component == pytest.approx(0.52) for component in timeout_value.values() + ) + else: + assert timeout_value == pytest.approx(0.52) + + +def test_flatten_transparencies_validation(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + pdf_file = make_pdf_file(PdfRestFileID.generate(1)) + png_file = PdfRestFile.model_validate( + build_file_info_payload( + PdfRestFileID.generate(), + "example.png", + "image/png", + ) + ) + transport = httpx.MockTransport(lambda request: (_ for _ in ()).throw(RuntimeError)) + + with ( + PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client, + pytest.raises(ValidationError, match="Must be a PDF file"), + ): + client.flatten_transparencies(png_file) + + with ( + PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client, + pytest.raises( + ValidationError, match="List should have at most 1 item after validation" + ), + ): + client.flatten_transparencies( + [pdf_file, make_pdf_file(PdfRestFileID.generate())] + ) + + with ( + PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client, + pytest.raises( + ValidationError, match="Input should be 'low', 'medium' or 'high'" + ), + ): + client.flatten_transparencies(pdf_file, quality="ultra") # type: ignore[arg-type] diff --git a/tests/test_linearize_pdf.py b/tests/test_linearize_pdf.py new file mode 100644 index 00000000..6b212437 --- /dev/null +++ b/tests/test_linearize_pdf.py @@ -0,0 +1,282 @@ +from __future__ import annotations + +import json + +import httpx +import pytest +from pydantic import ValidationError + +from pdfrest import AsyncPdfRestClient, PdfRestClient +from pdfrest.models import PdfRestFile, PdfRestFileBasedResponse, PdfRestFileID +from pdfrest.models._internal import PdfLinearizePayload + +from .graphics_test_helpers import ( + ASYNC_API_KEY, + VALID_API_KEY, + build_file_info_payload, + make_pdf_file, +) + + +def test_linearize_pdf_success(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(1)) + output_id = str(PdfRestFileID.generate()) + + payload_dump = PdfLinearizePayload.model_validate( + {"files": [input_file]} + ).model_dump(mode="json", by_alias=True, exclude_none=True, exclude_unset=True) + + seen: dict[str, int] = {"post": 0, "get": 0} + + def handler(request: httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/linearized-pdf": + seen["post"] += 1 + payload = json.loads(request.content.decode("utf-8")) + assert payload == payload_dump + return httpx.Response( + 200, + json={ + "inputId": [input_file.id], + "outputId": [output_id], + }, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + seen["get"] += 1 + assert request.url.params["format"] == "info" + return httpx.Response( + 200, + json=build_file_info_payload( + output_id, + "linearized.pdf", + "application/pdf", + ), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + with PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client: + response = client.linearize_pdf(input_file) + + assert seen == {"post": 1, "get": 1} + assert isinstance(response, PdfRestFileBasedResponse) + assert response.output_file.name == "linearized.pdf" + assert response.output_file.type == "application/pdf" + assert str(response.input_id) == str(input_file.id) + assert response.warning is None + + +def test_linearize_pdf_request_customization( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(1)) + output_id = str(PdfRestFileID.generate()) + captured_timeout: dict[str, float | dict[str, float] | None] = {} + + def handler(request: httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/linearized-pdf": + assert request.url.params["trace"] == "true" + assert request.headers["X-Debug"] == "sync" + captured_timeout["value"] = request.extensions.get("timeout") + payload = json.loads(request.content.decode("utf-8")) + assert payload["debug"] == "yes" + assert payload["id"] == str(input_file.id) + assert payload["output"] == "linearized" + return httpx.Response( + 200, + json={ + "inputId": [input_file.id], + "outputId": [output_id], + }, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + assert request.url.params["format"] == "info" + assert request.url.params["trace"] == "true" + assert request.headers["X-Debug"] == "sync" + return httpx.Response( + 200, + json=build_file_info_payload( + output_id, + "linearized-out.pdf", + "application/pdf", + ), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + with PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client: + response = client.linearize_pdf( + input_file, + output="linearized", + extra_query={"trace": "true"}, + extra_headers={"X-Debug": "sync"}, + extra_body={"debug": "yes"}, + timeout=0.61, + ) + + assert isinstance(response, PdfRestFileBasedResponse) + assert response.output_file.name == "linearized-out.pdf" + timeout_value = captured_timeout["value"] + assert timeout_value is not None + if isinstance(timeout_value, dict): + assert all( + component == pytest.approx(0.61) for component in timeout_value.values() + ) + else: + assert timeout_value == pytest.approx(0.61) + + +@pytest.mark.asyncio +async def test_async_linearize_pdf_success(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(2)) + output_id = str(PdfRestFileID.generate()) + + payload_dump = PdfLinearizePayload.model_validate( + {"files": [input_file]} + ).model_dump(mode="json", by_alias=True, exclude_none=True, exclude_unset=True) + + seen: dict[str, int] = {"post": 0, "get": 0} + + def handler(request: httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/linearized-pdf": + seen["post"] += 1 + payload = json.loads(request.content.decode("utf-8")) + assert payload == payload_dump + return httpx.Response( + 200, + json={ + "inputId": [input_file.id], + "outputId": [output_id], + }, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + seen["get"] += 1 + assert request.url.params["format"] == "info" + return httpx.Response( + 200, + json=build_file_info_payload( + output_id, + "async-linearized.pdf", + "application/pdf", + ), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + async with AsyncPdfRestClient(api_key=ASYNC_API_KEY, transport=transport) as client: + response = await client.linearize_pdf(input_file) + + assert seen == {"post": 1, "get": 1} + assert isinstance(response, PdfRestFileBasedResponse) + assert response.output_file.name == "async-linearized.pdf" + assert response.output_file.type == "application/pdf" + assert str(response.input_id) == str(input_file.id) + + +@pytest.mark.asyncio +async def test_async_linearize_pdf_request_customization( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(2)) + output_id = str(PdfRestFileID.generate()) + captured_timeout: dict[str, float | dict[str, float] | None] = {} + + def handler(request: httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/linearized-pdf": + assert request.url.params["trace"] == "async" + assert request.headers["X-Debug"] == "async" + captured_timeout["value"] = request.extensions.get("timeout") + payload = json.loads(request.content.decode("utf-8")) + assert payload["flags"] == ["a", "b"] + assert payload["id"] == str(input_file.id) + return httpx.Response( + 200, + json={ + "inputId": [input_file.id], + "outputId": [output_id], + }, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + assert request.url.params["format"] == "info" + assert request.url.params["trace"] == "async" + assert request.headers["X-Debug"] == "async" + return httpx.Response( + 200, + json=build_file_info_payload( + output_id, + "async-linearized-custom.pdf", + "application/pdf", + ), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + async with AsyncPdfRestClient(api_key=ASYNC_API_KEY, transport=transport) as client: + response = await client.linearize_pdf( + input_file, + extra_query={"trace": "async"}, + extra_headers={"X-Debug": "async"}, + extra_body={"flags": ["a", "b"]}, + timeout=0.83, + ) + + assert isinstance(response, PdfRestFileBasedResponse) + assert response.output_file.name == "async-linearized-custom.pdf" + timeout_value = captured_timeout["value"] + assert timeout_value is not None + if isinstance(timeout_value, dict): + assert all( + component == pytest.approx(0.83) for component in timeout_value.values() + ) + else: + assert timeout_value == pytest.approx(0.83) + + +@pytest.mark.parametrize( + ("files", "match"), + [ + pytest.param( + "png", + "Must be a PDF file", + id="non-pdf-file", + ), + pytest.param( + "multiple", + "List should have at most 1 item after validation", + id="multiple-files", + ), + ], +) +def test_linearize_pdf_validation( + monkeypatch: pytest.MonkeyPatch, + files: str, + match: str, +) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + pdf_file = make_pdf_file(PdfRestFileID.generate(1)) + png_file = PdfRestFile.model_validate( + build_file_info_payload( + PdfRestFileID.generate(), + "example.png", + "image/png", + ) + ) + transport = httpx.MockTransport(lambda request: (_ for _ in ()).throw(RuntimeError)) + files_argument = ( + png_file + if files == "png" + else [pdf_file, make_pdf_file(PdfRestFileID.generate())] + ) + + with ( + PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client, + pytest.raises(ValidationError, match=match), + ): + client.linearize_pdf(files_argument) diff --git a/tests/test_ocr_pdf.py b/tests/test_ocr_pdf.py new file mode 100644 index 00000000..625f92f4 --- /dev/null +++ b/tests/test_ocr_pdf.py @@ -0,0 +1,217 @@ +from __future__ import annotations + +import json + +import httpx +import pytest +from pydantic import ValidationError + +from pdfrest import AsyncPdfRestClient, PdfRestClient +from pdfrest.models import PdfRestFile, PdfRestFileBasedResponse, PdfRestFileID +from pdfrest.models._internal import OcrPdfPayload + +from .graphics_test_helpers import ASYNC_API_KEY, VALID_API_KEY, make_pdf_file + + +def test_ocr_payload_rejects_non_pdf() -> None: + file_id = str(PdfRestFileID.generate()) + text_file = PdfRestFile.model_validate( + { + "id": file_id, + "name": "notes.txt", + "url": f"https://api.pdfrest.com/resource/{file_id}", + "type": "text/plain", + "size": 64, + "modified": "2024-01-01T00:00:00Z", + "scheduledDeletionTimeUtc": None, + } + ) + with pytest.raises(ValidationError, match="Must be a PDF file"): + OcrPdfPayload.model_validate({"files": [text_file]}) + + +def test_ocr_payload_invalid_page_range() -> None: + file_repr = make_pdf_file(PdfRestFileID.generate(1)) + with pytest.raises( + ValidationError, match="The start page must be less than or equal to the end" + ): + OcrPdfPayload.model_validate({"files": [file_repr], "pages": ["5-2"]}) + + +def test_ocr_payload_languages() -> None: + file_repr = make_pdf_file(PdfRestFileID.generate(1)) + payload = OcrPdfPayload.model_validate( + {"files": [file_repr], "languages": ["English", "German"]} + ) + assert payload.languages == ["English", "German"] + assert ( + payload.model_dump( + mode="json", by_alias=True, exclude_none=True, exclude_unset=True + )["languages"] + == "English,German" + ) + + +def test_ocr_payload_invalid_language() -> None: + file_repr = make_pdf_file(PdfRestFileID.generate(1)) + with pytest.raises(ValidationError, match="ChineseSimplified"): + OcrPdfPayload.model_validate({"files": [file_repr], "languages": ["Klingon"]}) + + +def test_ocr_pdf_success(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(1)) + payload_dump = OcrPdfPayload.model_validate( + { + "files": [input_file], + "pages": ["1-3"], + "output": "ocr", + "languages": ["English"], + } + ).model_dump(mode="json", by_alias=True, exclude_none=True, exclude_unset=True) + output_id = str(PdfRestFileID.generate()) + + seen: dict[str, int] = {"post": 0, "get": 0} + + def handler(request: httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/pdf-with-ocr-text": + seen["post"] += 1 + payload = json.loads(request.content.decode("utf-8")) + assert payload == payload_dump + return httpx.Response( + 200, + json={ + "inputId": str(input_file.id), + "outputId": output_id, + }, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + seen["get"] += 1 + return httpx.Response( + 200, + json=make_pdf_file(output_id, "ocr.pdf").model_dump( + mode="json", by_alias=True + ), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + with PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client: + response = client.ocr_pdf( + input_file, + pages=["1-3"], + output="ocr", + ) + + assert seen == {"post": 1, "get": 1} + assert isinstance(response, PdfRestFileBasedResponse) + assert response.output_file.id == output_id + assert response.output_file.name == "ocr.pdf" + assert response.input_id == input_file.id + + +def test_ocr_pdf_request_customization( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(1)) + payload_dump = OcrPdfPayload.model_validate( + {"files": [input_file], "languages": ["English"]} + ).model_dump(mode="json", by_alias=True, exclude_none=True, exclude_unset=True) + output_id = str(PdfRestFileID.generate()) + captured_timeout: dict[str, float | dict[str, float] | None] = {} + + def handler(request: httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/pdf-with-ocr-text": + assert request.url.params["trace"] == "true" + assert request.headers["X-Debug"] == "sync" + captured_timeout["value"] = request.extensions.get("timeout") + payload = json.loads(request.content.decode("utf-8")) + assert payload == payload_dump | {"debug": True} + return httpx.Response( + 200, + json={ + "outputId": output_id, + "inputId": str(input_file.id), + }, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + assert request.url.params["format"] == "info" + assert request.url.params["trace"] == "true" + assert request.headers["X-Debug"] == "sync" + return httpx.Response( + 200, + json=make_pdf_file(output_id, "custom-ocr.pdf").model_dump( + mode="json", by_alias=True + ), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + with PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client: + response = client.ocr_pdf( + input_file, + extra_query={"trace": "true"}, + extra_headers={"X-Debug": "sync"}, + extra_body={"debug": True}, + timeout=0.4, + ) + + assert isinstance(response, PdfRestFileBasedResponse) + assert response.output_file.id == output_id + timeout_value = captured_timeout["value"] + assert timeout_value is not None + if isinstance(timeout_value, dict): + assert all( + component == pytest.approx(0.4) for component in timeout_value.values() + ) + else: + assert timeout_value == pytest.approx(0.4) + + +@pytest.mark.asyncio +async def test_async_ocr_pdf_success( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(2)) + payload_dump = OcrPdfPayload.model_validate( + {"files": [input_file], "languages": ["English"]} + ).model_dump(mode="json", by_alias=True, exclude_none=True, exclude_unset=True) + output_id = str(PdfRestFileID.generate()) + + seen: dict[str, int] = {"post": 0, "get": 0} + + def handler(request: httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/pdf-with-ocr-text": + seen["post"] += 1 + payload = json.loads(request.content.decode("utf-8")) + assert payload == payload_dump + return httpx.Response( + 200, + json={ + "outputId": output_id, + "inputId": str(input_file.id), + }, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + seen["get"] += 1 + return httpx.Response( + 200, + json=make_pdf_file(output_id, "async-ocr.pdf").model_dump( + mode="json", by_alias=True + ), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + async with AsyncPdfRestClient(api_key=ASYNC_API_KEY, transport=transport) as client: + response = await client.ocr_pdf(input_file) + + assert seen == {"post": 1, "get": 1} + assert isinstance(response, PdfRestFileBasedResponse) + assert response.output_file.id == output_id + assert response.input_id == input_file.id diff --git a/tests/test_rasterize_pdf.py b/tests/test_rasterize_pdf.py new file mode 100644 index 00000000..707ab223 --- /dev/null +++ b/tests/test_rasterize_pdf.py @@ -0,0 +1,265 @@ +from __future__ import annotations + +import json + +import httpx +import pytest +from pydantic import ValidationError + +from pdfrest import AsyncPdfRestClient, PdfRestClient +from pdfrest.models import PdfRestFile, PdfRestFileBasedResponse, PdfRestFileID +from pdfrest.models._internal import PdfRasterizePayload + +from .graphics_test_helpers import ( + ASYNC_API_KEY, + VALID_API_KEY, + build_file_info_payload, + make_pdf_file, +) + + +def test_rasterize_pdf_success(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(1)) + output_id = str(PdfRestFileID.generate()) + + payload_dump = PdfRasterizePayload.model_validate( + {"files": [input_file], "output": "rasterized"} + ).model_dump(mode="json", by_alias=True, exclude_none=True, exclude_unset=True) + + seen: dict[str, int] = {"post": 0, "get": 0} + + def handler(request: httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/rasterized-pdf": + seen["post"] += 1 + payload = json.loads(request.content.decode("utf-8")) + assert payload == payload_dump + return httpx.Response( + 200, + json={ + "inputId": [input_file.id], + "outputId": [output_id], + }, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + seen["get"] += 1 + assert request.url.params["format"] == "info" + return httpx.Response( + 200, + json=build_file_info_payload( + output_id, + "rasterized.pdf", + "application/pdf", + ), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + with PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client: + response = client.rasterize_pdf(input_file, output="rasterized") + + assert seen == {"post": 1, "get": 1} + assert isinstance(response, PdfRestFileBasedResponse) + output_file = response.output_file + assert output_file.name == "rasterized.pdf" + assert output_file.type == "application/pdf" + assert response.warning is None + assert str(response.input_id) == str(input_file.id) + + +def test_rasterize_pdf_request_customization(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(1)) + output_id = str(PdfRestFileID.generate()) + captured_timeout: dict[str, float | dict[str, float] | None] = {} + + def handler(request: httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/rasterized-pdf": + assert request.url.params["trace"] == "true" + assert request.headers["X-Debug"] == "sync" + captured_timeout["value"] = request.extensions.get("timeout") + payload = json.loads(request.content.decode("utf-8")) + assert payload["debug"] == "yes" + assert payload["id"] == str(input_file.id) + assert payload["output"] == "custom" + return httpx.Response( + 200, + json={ + "inputId": [input_file.id], + "outputId": [output_id], + }, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + assert request.url.params["format"] == "info" + assert request.url.params["trace"] == "true" + assert request.headers["X-Debug"] == "sync" + return httpx.Response( + 200, + json=build_file_info_payload( + output_id, + "custom.pdf", + "application/pdf", + ), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + with PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client: + response = client.rasterize_pdf( + input_file, + output="custom", + extra_query={"trace": "true"}, + extra_headers={"X-Debug": "sync"}, + extra_body={"debug": "yes"}, + timeout=0.31, + ) + + assert isinstance(response, PdfRestFileBasedResponse) + assert response.output_file.name == "custom.pdf" + timeout_value = captured_timeout["value"] + assert timeout_value is not None + if isinstance(timeout_value, dict): + assert all( + component == pytest.approx(0.31) for component in timeout_value.values() + ) + else: + assert timeout_value == pytest.approx(0.31) + + +@pytest.mark.asyncio +async def test_async_rasterize_pdf_success(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(2)) + output_id = str(PdfRestFileID.generate()) + + payload_dump = PdfRasterizePayload.model_validate( + {"files": [input_file]} + ).model_dump(mode="json", by_alias=True, exclude_none=True, exclude_unset=True) + + seen: dict[str, int] = {"post": 0, "get": 0} + + def handler(request: httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/rasterized-pdf": + seen["post"] += 1 + payload = json.loads(request.content.decode("utf-8")) + assert payload == payload_dump + return httpx.Response( + 200, + json={ + "inputId": [input_file.id], + "outputId": [output_id], + }, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + seen["get"] += 1 + assert request.url.params["format"] == "info" + return httpx.Response( + 200, + json=build_file_info_payload( + output_id, + "async.pdf", + "application/pdf", + ), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + async with AsyncPdfRestClient(api_key=ASYNC_API_KEY, transport=transport) as client: + response = await client.rasterize_pdf(input_file) + + assert seen == {"post": 1, "get": 1} + assert isinstance(response, PdfRestFileBasedResponse) + assert response.output_file.name == "async.pdf" + assert response.output_file.type == "application/pdf" + assert str(response.input_id) == str(input_file.id) + + +@pytest.mark.asyncio +async def test_async_rasterize_pdf_request_customization( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(2)) + output_id = str(PdfRestFileID.generate()) + captured_timeout: dict[str, float | dict[str, float] | None] = {} + + def handler(request: httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/rasterized-pdf": + assert request.url.params["trace"] == "async" + assert request.headers["X-Debug"] == "async" + captured_timeout["value"] = request.extensions.get("timeout") + payload = json.loads(request.content.decode("utf-8")) + assert payload["debug"] == "yes" + assert payload["id"] == str(input_file.id) + return httpx.Response( + 200, + json={ + "inputId": [input_file.id], + "outputId": [output_id], + }, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + assert request.url.params["format"] == "info" + assert request.url.params["trace"] == "async" + assert request.headers["X-Debug"] == "async" + return httpx.Response( + 200, + json=build_file_info_payload( + output_id, + "async-custom.pdf", + "application/pdf", + ), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + async with AsyncPdfRestClient(api_key=ASYNC_API_KEY, transport=transport) as client: + response = await client.rasterize_pdf( + input_file, + extra_query={"trace": "async"}, + extra_headers={"X-Debug": "async"}, + extra_body={"debug": "yes"}, + timeout=0.52, + ) + + assert isinstance(response, PdfRestFileBasedResponse) + assert response.output_file.name == "async-custom.pdf" + timeout_value = captured_timeout["value"] + assert timeout_value is not None + if isinstance(timeout_value, dict): + assert all( + component == pytest.approx(0.52) for component in timeout_value.values() + ) + else: + assert timeout_value == pytest.approx(0.52) + + +def test_rasterize_pdf_validation(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + pdf_file = make_pdf_file(PdfRestFileID.generate(1)) + png_file = PdfRestFile.model_validate( + build_file_info_payload( + PdfRestFileID.generate(), + "example.png", + "image/png", + ) + ) + transport = httpx.MockTransport(lambda request: (_ for _ in ()).throw(RuntimeError)) + + with ( + PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client, + pytest.raises(ValidationError, match="Must be a PDF file"), + ): + client.rasterize_pdf(png_file) + + with ( + PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client, + pytest.raises( + ValidationError, match="List should have at most 1 item after validation" + ), + ): + client.rasterize_pdf([pdf_file, make_pdf_file(PdfRestFileID.generate())]) diff --git a/tests/test_summarize_pdf_text.py b/tests/test_summarize_pdf_text.py new file mode 100644 index 00000000..4263c488 --- /dev/null +++ b/tests/test_summarize_pdf_text.py @@ -0,0 +1,330 @@ +from __future__ import annotations + +import json + +import httpx +import pytest +from pydantic import ValidationError + +from pdfrest import AsyncPdfRestClient, PdfRestClient +from pdfrest.models import ( + PdfRestFile, + PdfRestFileBasedResponse, + PdfRestFileID, + SummarizePdfTextResponse, +) +from pdfrest.models._internal import SummarizePdfTextPayload + +from .graphics_test_helpers import ( + ASYNC_API_KEY, + VALID_API_KEY, + build_file_info_payload, + make_pdf_file, +) + + +def _make_text_file(file_id: str) -> PdfRestFile: + return PdfRestFile.model_validate( + { + "id": file_id, + "name": "notes.txt", + "url": f"https://api.pdfrest.com/resource/{file_id}", + "type": "text/plain", + "size": 64, + "modified": "2024-01-01T00:00:00Z", + "scheduledDeletionTimeUtc": None, + } + ) + + +def test_summarize_payload_rejects_invalid_mime() -> None: + file_id = str(PdfRestFileID.generate()) + image_file = PdfRestFile.model_validate( + { + "id": file_id, + "name": "image.png", + "url": f"https://api.pdfrest.com/resource/{file_id}", + "type": "image/png", + "size": 10, + "modified": "2024-01-01T00:00:00Z", + "scheduledDeletionTimeUtc": None, + } + ) + + with pytest.raises( + ValidationError, match="Must be a PDF, Markdown, or plain text file" + ): + SummarizePdfTextPayload.model_validate({"files": [image_file]}) + + +def test_summarize_payload_invalid_page_range() -> None: + file_repr = make_pdf_file(PdfRestFileID.generate(1)) + + with pytest.raises( + ValidationError, match="The start page must be less than or equal to the end" + ): + SummarizePdfTextPayload.model_validate({"files": [file_repr], "pages": ["5-2"]}) + + +def test_summarize_text_json_success(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = _make_text_file(str(PdfRestFileID.generate(1))) + payload_dump = SummarizePdfTextPayload.model_validate( + { + "files": [input_file], + "target_word_count": 120, + "summary_format": "bullet_points", + "pages": ["1-3"], + "output_format": "plaintext", + "output_type": "json", + "output": "summary", + } + ).model_dump(mode="json", by_alias=True, exclude_none=True, exclude_unset=True) + + seen: dict[str, int] = {"post": 0} + + def handler(request: httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/summarized-pdf-text": + seen["post"] += 1 + payload = json.loads(request.content.decode("utf-8")) + assert payload == payload_dump + return httpx.Response( + 200, + json={ + "summary": "Key points...", + "inputId": str(input_file.id), + }, + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + with PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client: + response = client.summarize_text( + input_file, + target_word_count=120, + summary_format="bullet_points", + pages=["1-3"], + output_format="plaintext", + output="summary", + ) + + assert seen == {"post": 1} + assert isinstance(response, SummarizePdfTextResponse) + assert response.summary == "Key points..." + assert response.input_id == input_file.id + + +def test_summarize_text_to_file_success( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = _make_text_file(str(PdfRestFileID.generate(1))) + payload_dump = SummarizePdfTextPayload.model_validate( + { + "files": [input_file], + "target_word_count": 200, + "summary_format": "bullet_points", + "pages": ["2-last"], + "output_format": "plaintext", + "output_type": "file", + "output": "summary", + } + ).model_dump(mode="json", by_alias=True, exclude_none=True, exclude_unset=True) + output_id = str(PdfRestFileID.generate()) + + seen: dict[str, int] = {"post": 0, "get": 0} + + def handler(request: httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/summarized-pdf-text": + seen["post"] += 1 + payload = json.loads(request.content.decode("utf-8")) + assert payload == payload_dump + return httpx.Response( + 200, + json={ + "outputId": output_id, + "inputId": str(input_file.id), + }, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + seen["get"] += 1 + return httpx.Response( + 200, + json=build_file_info_payload(output_id, "summary.txt", "text/plain"), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + with PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client: + response = client.summarize_text_to_file( + input_file, + target_word_count=200, + summary_format="bullet_points", + pages=["2-last"], + output_format="plaintext", + output="summary", + ) + + assert seen == {"post": 1, "get": 1} + assert isinstance(response, PdfRestFileBasedResponse) + assert response.output_file.id == output_id + assert response.output_file.name == "summary.txt" + assert response.input_id == input_file.id + + +def test_summarize_text_to_file_request_customization( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(1)) + payload_dump = SummarizePdfTextPayload.model_validate( + { + "files": [input_file], + "output_type": "file", + "output_format": "markdown", + "summary_format": "overview", + } + ).model_dump(mode="json", by_alias=True, exclude_none=True, exclude_unset=True) + output_id = str(PdfRestFileID.generate()) + + captured_timeout: dict[str, float | dict[str, float] | None] = {} + seen: dict[str, int] = {"post": 0, "get": 0} + + def handler(request: httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/summarized-pdf-text": + seen["post"] += 1 + assert request.url.params["trace"] == "true" + assert request.headers["X-Debug"] == "sync" + captured_timeout["value"] = request.extensions.get("timeout") + payload = json.loads(request.content.decode("utf-8")) + for key, value in payload_dump.items(): + assert payload[key] == value + assert payload["debug"] is True + return httpx.Response( + 200, + json={ + "outputId": output_id, + "inputId": str(input_file.id), + }, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + seen["get"] += 1 + assert request.url.params["format"] == "info" + assert request.url.params["trace"] == "true" + assert request.headers["X-Debug"] == "sync" + return httpx.Response( + 200, + json=build_file_info_payload(output_id, "summary.txt", "text/plain"), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + with PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client: + response = client.summarize_text_to_file( + input_file, + extra_query={"trace": "true"}, + extra_headers={"X-Debug": "sync"}, + extra_body={"debug": True}, + timeout=0.25, + ) + + assert seen == {"post": 1, "get": 1} + assert isinstance(response, PdfRestFileBasedResponse) + assert response.output_file.id == output_id + assert response.output_file.name == "summary.txt" + timeout_value = captured_timeout["value"] + assert timeout_value is not None + if isinstance(timeout_value, dict): + assert all( + component == pytest.approx(0.25) for component in timeout_value.values() + ) + else: + assert timeout_value == pytest.approx(0.25) + + +@pytest.mark.asyncio +async def test_async_summarize_text_success( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(2)) + payload_dump = SummarizePdfTextPayload.model_validate( + {"files": [input_file], "output_type": "json"} + ).model_dump(mode="json", by_alias=True, exclude_none=True, exclude_unset=True) + + seen: dict[str, int] = {"post": 0} + + def handler(request: httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/summarized-pdf-text": + seen["post"] += 1 + payload = json.loads(request.content.decode("utf-8")) + for key, value in payload_dump.items(): + assert payload[key] == value + return httpx.Response( + 200, + json={ + "summary": "Async summary", + "inputId": str(input_file.id), + }, + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + async with AsyncPdfRestClient(api_key=ASYNC_API_KEY, transport=transport) as client: + response = await client.summarize_text(input_file) + + assert seen == {"post": 1} + assert isinstance(response, SummarizePdfTextResponse) + assert response.summary == "Async summary" + assert response.input_id == input_file.id + + +@pytest.mark.asyncio +async def test_async_summarize_text_to_file_success( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(2)) + payload_dump = SummarizePdfTextPayload.model_validate( + {"files": [input_file], "output_type": "file"} + ).model_dump(mode="json", by_alias=True, exclude_none=True, exclude_unset=True) + output_id = str(PdfRestFileID.generate()) + + seen: dict[str, int] = {"post": 0, "get": 0} + + def handler(request: httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/summarized-pdf-text": + seen["post"] += 1 + payload = json.loads(request.content.decode("utf-8")) + for key, value in payload_dump.items(): + assert payload[key] == value + return httpx.Response( + 200, + json={ + "outputId": output_id, + "inputId": str(input_file.id), + }, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + seen["get"] += 1 + return httpx.Response( + 200, + json=build_file_info_payload( + output_id, "async-summary.txt", "text/plain" + ), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + async with AsyncPdfRestClient(api_key=ASYNC_API_KEY, transport=transport) as client: + response = await client.summarize_text_to_file(input_file) + + assert seen == {"post": 1, "get": 1} + assert isinstance(response, PdfRestFileBasedResponse) + assert response.output_file.id == output_id + assert response.input_id == input_file.id diff --git a/tests/test_translate_pdf_text.py b/tests/test_translate_pdf_text.py new file mode 100644 index 00000000..7cfe5c76 --- /dev/null +++ b/tests/test_translate_pdf_text.py @@ -0,0 +1,282 @@ +from __future__ import annotations + +import json +import re + +import httpx +import pytest +from pydantic import ValidationError + +from pdfrest import AsyncPdfRestClient, PdfRestClient +from pdfrest.models import ( + PdfRestFile, + PdfRestFileID, + TranslatePdfTextFileResponse, + TranslatePdfTextResponse, +) +from pdfrest.models._internal import TranslatePdfTextPayload + +from .graphics_test_helpers import ASYNC_API_KEY, VALID_API_KEY, make_pdf_file + +OUTPUT_LANGUAGE_ERROR = ( + "The provided 'output_language' language tag is invalid. Format 'output_language' " + "as a valid 2-3 character ISO 639 language code (e.g., 'en', 'es', 'fra'), " + "optionally with a script, alphabetic region, or numeric region (e.g., 'zh-Hant', " + "'eng-US', 'es-419'). See documentation for recommended formats." +) + + +def _make_markdown_file(file_id: str) -> PdfRestFile: + return PdfRestFile.model_validate( + { + "id": file_id, + "name": "notes.md", + "url": f"https://api.pdfrest.com/resource/{file_id}", + "type": "text/markdown", + "size": 64, + "modified": "2024-01-01T00:00:00Z", + "scheduledDeletionTimeUtc": None, + } + ) + + +def test_translate_payload_rejects_invalid_mime() -> None: + file_id = str(PdfRestFileID.generate()) + image_file = PdfRestFile.model_validate( + { + "id": file_id, + "name": "image.png", + "url": f"https://api.pdfrest.com/resource/{file_id}", + "type": "image/png", + "size": 10, + "modified": "2024-01-01T00:00:00Z", + "scheduledDeletionTimeUtc": None, + } + ) + + with pytest.raises( + ValidationError, match="Must be a PDF, Markdown, or plain text file" + ): + TranslatePdfTextPayload.model_validate( + {"files": [image_file], "output_language": "fr"} + ) + + +@pytest.mark.parametrize( + "output_language", + [ + pytest.param("en", id="language-2-letter"), + pytest.param("fra", id="language-3-letter"), + pytest.param("zh-Hant", id="script"), + pytest.param("eng-US", id="alpha-region"), + pytest.param("es-419", id="numeric-region"), + ], +) +def test_translate_payload_accepts_valid_output_language( + output_language: str, +) -> None: + file_repr = make_pdf_file(PdfRestFileID.generate(1)) + payload = TranslatePdfTextPayload.model_validate( + {"files": [file_repr], "output_language": output_language} + ) + + assert payload.output_language == output_language + + +@pytest.mark.parametrize( + "output_language", + [ + pytest.param("", id="empty"), + pytest.param("e", id="too-short"), + pytest.param("english", id="not-a-code"), + pytest.param("eng-USA", id="long-subtag"), + pytest.param("en-1234", id="long-numeric-region"), + pytest.param("en-US-extra", id="too-many-subtags"), + ], +) +def test_translate_payload_rejects_invalid_output_language( + output_language: str, +) -> None: + file_repr = make_pdf_file(PdfRestFileID.generate(1)) + with pytest.raises( + ValidationError, + match=re.escape(OUTPUT_LANGUAGE_ERROR), + ): + TranslatePdfTextPayload.model_validate( + {"files": [file_repr], "output_language": output_language} + ) + + +def test_translate_payload_requires_target_language() -> None: + file_repr = make_pdf_file(PdfRestFileID.generate(1)) + with pytest.raises(ValidationError): + TranslatePdfTextPayload.model_validate({"files": [file_repr]}) + + +def test_translate_pdf_text_json_success(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = _make_markdown_file(str(PdfRestFileID.generate(1))) + payload_dump = TranslatePdfTextPayload.model_validate( + { + "files": [input_file], + "output_language": "fr", + "pages": ["1-2"], + "output_format": "plaintext", + "output_type": "json", + "output": "translation", + } + ).model_dump(mode="json", by_alias=True, exclude_none=True, exclude_unset=True) + + seen: dict[str, int] = {"post": 0} + + def handler(request: httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/translated-pdf-text": + seen["post"] += 1 + payload = json.loads(request.content.decode("utf-8")) + assert payload == payload_dump + return httpx.Response( + 200, + json={ + "translated_text": "Bonjour", + "inputId": str(input_file.id), + "source_languages": ["en"], + "output_language": "fr", + }, + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + with PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client: + response = client.translate_pdf_text( + input_file, + output_language="fr", + pages=["1-2"], + output_format="plaintext", + output="translation", + ) + + assert seen == {"post": 1} + assert isinstance(response, TranslatePdfTextResponse) + assert response.translated_text == "Bonjour" + assert response.source_languages == ["en"] + assert response.output_language == "fr" + assert response.input_id == input_file.id + + +def test_translate_pdf_text_request_customization( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(1)) + payload_dump = TranslatePdfTextPayload.model_validate( + { + "files": [input_file], + "output_language": "es", + "output_type": "file", + } + ).model_dump(mode="json", by_alias=True, exclude_none=True, exclude_unset=True) + output_id = str(PdfRestFileID.generate()) + + captured_timeout: dict[str, float | dict[str, float] | None] = {} + + def handler(request: httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/translated-pdf-text": + assert request.url.params["trace"] == "true" + assert request.headers["X-Debug"] == "sync" + captured_timeout["value"] = request.extensions.get("timeout") + payload = json.loads(request.content.decode("utf-8")) + for key, value in payload_dump.items(): + assert payload[key] == value + assert payload["debug"] is True + return httpx.Response( + 200, + json={ + "outputUrl": f"https://api.pdfrest.com/resource/{output_id}?format=file", + "outputId": output_id, + "inputId": str(input_file.id), + "source_languages": ["en"], + "output_language": "es", + }, + ) + if request.method == "GET" and request.url.path == f"/resource/{output_id}": + assert request.url.params["format"] == "info" + assert request.url.params["trace"] == "true" + assert request.headers["X-Debug"] == "sync" + return httpx.Response( + 200, + json=_make_markdown_file(output_id).model_dump( + mode="json", by_alias=True + ), + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + with PdfRestClient(api_key=VALID_API_KEY, transport=transport) as client: + response = client.translate_pdf_text_to_file( + input_file, + output_language="es", + extra_query={"trace": "true"}, + extra_headers={"X-Debug": "sync"}, + extra_body={"debug": True}, + timeout=0.3, + ) + + assert isinstance(response, TranslatePdfTextFileResponse) + assert response.output_file.id == output_id + assert response.output_language == "es" + assert response.source_languages == ["en"] + timeout_value = captured_timeout["value"] + assert timeout_value is not None + if isinstance(timeout_value, dict): + assert all( + component == pytest.approx(0.3) for component in timeout_value.values() + ) + else: + assert timeout_value == pytest.approx(0.3) + + +@pytest.mark.asyncio +async def test_async_translate_pdf_text_success( + monkeypatch: pytest.MonkeyPatch, +) -> None: + monkeypatch.delenv("PDFREST_API_KEY", raising=False) + input_file = make_pdf_file(PdfRestFileID.generate(2)) + payload_dump = TranslatePdfTextPayload.model_validate( + {"files": [input_file], "output_language": "de", "output_type": "json"} + ).model_dump(mode="json", by_alias=True, exclude_none=True, exclude_unset=True) + + seen: dict[str, int] = {"post": 0} + + def handler(request: httpx.Request) -> httpx.Response: + if request.method == "POST" and request.url.path == "/translated-pdf-text": + seen["post"] += 1 + payload = json.loads(request.content.decode("utf-8")) + for key, value in payload_dump.items(): + assert payload[key] == value + return httpx.Response( + 200, + json={ + "translated_text": "Hallo", + "inputId": str(input_file.id), + "source_languages": ["en"], + "output_language": "de", + }, + ) + msg = f"Unexpected request {request.method} {request.url}" + raise AssertionError(msg) + + transport = httpx.MockTransport(handler) + async with AsyncPdfRestClient(api_key=ASYNC_API_KEY, transport=transport) as client: + response = await client.translate_pdf_text( + input_file, + output_language="de", + ) + + assert seen == {"post": 1} + assert isinstance(response, TranslatePdfTextResponse) + assert response.translated_text == "Hallo" + assert response.source_languages == ["en"] + assert response.output_language == "de" + assert response.input_id == input_file.id diff --git a/uv.lock b/uv.lock index ba0e7705..aa0a1f3c 100644 --- a/uv.lock +++ b/uv.lock @@ -439,6 +439,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/2c/e1/e6716421ea10d38022b952c159d5161ca1193197fb744506875fbb87ea7b/iniconfig-2.1.0-py3-none-any.whl", hash = "sha256:9deba5723312380e77435581c6bf4935c94cbfab9b1ed33ef8d238ea168eb760", size = 6050, upload-time = "2025-03-19T20:10:01.071Z" }, ] +[[package]] +name = "langcodes" +version = "3.5.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/a9/75/f9edc5d72945019312f359e69ded9f82392a81d49c5051ed3209b100c0d2/langcodes-3.5.1.tar.gz", hash = "sha256:40bff315e01b01d11c2ae3928dd4f5cbd74dd38f9bd912c12b9a3606c143f731", size = 191084, upload-time = "2025-12-02T16:22:01.627Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/dd/c1/d10b371bcba7abce05e2b33910e39c33cfa496a53f13640b7b8e10bb4d2b/langcodes-3.5.1-py3-none-any.whl", hash = "sha256:b6a9c25c603804e2d169165091d0cdb23934610524a21d226e4f463e8e958a72", size = 183050, upload-time = "2025-12-02T16:21:59.954Z" }, +] + [[package]] name = "license-expression" version = "30.4.4" @@ -601,6 +610,7 @@ source = { editable = "." } dependencies = [ { name = "exceptiongroup" }, { name = "httpx" }, + { name = "langcodes" }, { name = "pydantic" }, ] @@ -626,6 +636,7 @@ dev = [ requires-dist = [ { name = "exceptiongroup", specifier = ">=1.3.0" }, { name = "httpx", specifier = ">=0.28.1" }, + { name = "langcodes", specifier = ">=3.4.0" }, { name = "pydantic", specifier = ">=2.12.0" }, ]