From c13aa95b1f73db9e19cfc26afc1e72470772287a Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Fri, 19 Jun 2026 15:36:09 +0200 Subject: [PATCH] refactor: polish logging across the codebase Standardize internal modules on getLogger(__name__), rebalance log levels (no-op platform-only calls error->warning), and trim Scrapy trace-debug noise ahead of the next major release. --- src/apify/_actor.py | 14 ++++++++------ src/apify/_charging.py | 5 ++++- src/apify/_proxy_configuration.py | 4 +++- src/apify/events/_apify_event_manager.py | 6 ++++-- src/apify/scrapy/middlewares/apify_proxy.py | 7 ------- src/apify/scrapy/pipelines/actor_dataset_push.py | 4 ---- src/apify/scrapy/requests.py | 12 ------------ src/apify/scrapy/scheduler.py | 9 +-------- .../storage_clients/_apify/_alias_resolving.py | 3 +-- .../_file_system/_key_value_store_client.py | 2 +- tests/unit/actor/test_actor_helpers.py | 6 +++--- tests/unit/actor/test_actor_lifecycle.py | 6 +++--- tests/unit/events/test_apify_event_manager.py | 2 +- tests/unit/scrapy/test_scheduler.py | 4 ++-- 14 files changed, 31 insertions(+), 53 deletions(-) diff --git a/src/apify/_actor.py b/src/apify/_actor.py index 20242a9a5..23c70e9d2 100644 --- a/src/apify/_actor.py +++ b/src/apify/_actor.py @@ -1162,7 +1162,7 @@ async def metamorph( custom_after_sleep: How long to sleep for after the metamorph, to wait for the container to be stopped. """ if not self.is_at_home(): - self.log.error('Actor.metamorph() is only supported when running on the Apify platform.') + self.log.warning('Actor.metamorph() is only supported when running on the Apify platform.') return if custom_after_sleep is None: @@ -1172,6 +1172,7 @@ async def metamorph( if not self.configuration.actor_run_id: raise RuntimeError('actor_run_id cannot be None when running on the Apify platform.') + self.log.debug(f'Metamorphing the Actor run into target Actor {target_actor_id!r}.') await self.apify_client.run(self.configuration.actor_run_id).metamorph( target_actor_id=target_actor_id, run_input=run_input, @@ -1198,7 +1199,7 @@ async def reboot( custom_after_sleep: How long to sleep for after the reboot, to wait for the container to be stopped. """ if not self.is_at_home(): - self.log.error('Actor.reboot() is only supported when running on the Apify platform.') + self.log.warning('Actor.reboot() is only supported when running on the Apify platform.') return if self._is_rebooting: @@ -1243,6 +1244,7 @@ async def safe_dispatch(listener: Any, data: Any) -> None: except TimeoutError: self.log.warning('Pre-reboot event listeners did not finish within timeout; proceeding with reboot') + self.log.debug('Rebooting the Actor run.') await self.apify_client.run(self.configuration.actor_run_id).reboot() except BaseException: # Reset the flag so that a failed or cancelled reboot can be retried. @@ -1277,7 +1279,7 @@ async def add_webhook(self, webhook: Webhook, *, idempotency_key: str | None = N ) if not self.is_at_home(): - self.log.error('Actor.add_webhook() is only supported when running on the Apify platform.') + self.log.warning('Actor.add_webhook() is only supported when running on the Apify platform.') return # If is_at_home() is True, config.actor_run_id is always set @@ -1454,9 +1456,9 @@ def _get_remaining_time(self) -> timedelta | None: return max(self.configuration.timeout_at - datetime.now(tz=UTC), timedelta(0)) self.log.warning( - 'Using `inherit` argument is only possible when the Actor' - ' is running on the Apify platform and when the timeout for the Actor run is set. ' - f'{self.is_at_home()=}, {self.configuration.timeout_at=}' + 'Using the `inherit` argument is only possible when the Actor is running on the Apify platform and ' + 'the timeout for the Actor run is set.', + extra={'is_at_home': self.is_at_home(), 'timeout_at': self.configuration.timeout_at}, ) return None diff --git a/src/apify/_charging.py b/src/apify/_charging.py index 0a23ad4ef..9bbdeb3e4 100644 --- a/src/apify/_charging.py +++ b/src/apify/_charging.py @@ -5,6 +5,7 @@ from dataclasses import dataclass from datetime import UTC, datetime from decimal import Decimal +from logging import getLogger from typing import TYPE_CHECKING, Literal, Protocol, TypedDict from pydantic import ConfigDict @@ -19,7 +20,6 @@ from apify_client._models import PricingPerEvent as ClientPricingPerEvent from apify._utils import ReentrantLock, docs_group, ensure_context -from apify.log import logger from apify.storages import Dataset if TYPE_CHECKING: @@ -29,6 +29,8 @@ from apify._configuration import Configuration +logger = getLogger(__name__) + charging_manager_ctx: ContextVar[ChargingManager | None] = ContextVar('charging_manager_ctx', default=None) """Holds the current `ChargingManager` instance, if any. @@ -450,6 +452,7 @@ async def charge(self, event_name: str, *, count: int = 1) -> ChargeResult: pass elif event_name in self._pricing_info: await self._client.run(self._actor_run_id).charge(event_name, count=charged_count) + logger.debug(f"Charged {charged_count} occurrence(s) of event '{event_name}'.") elif event_name in self._tier_priced_events: logger.warning( f"Event '{event_name}' is tier-priced and is not chargeable via the pay-per-event API." diff --git a/src/apify/_proxy_configuration.py b/src/apify/_proxy_configuration.py index fec4d4ddf..aaedb08ac 100644 --- a/src/apify/_proxy_configuration.py +++ b/src/apify/_proxy_configuration.py @@ -3,6 +3,7 @@ import json import re from dataclasses import dataclass, field +from logging import getLogger from re import Pattern from typing import TYPE_CHECKING, Any @@ -22,13 +23,14 @@ from apify._configuration import Configuration from apify._consts import ApifyEnvVars from apify._utils import docs_group -from apify.log import logger if TYPE_CHECKING: from apify_client import ApifyClientAsync from apify import Request +logger = getLogger(__name__) + APIFY_PROXY_VALUE_REGEX = re.compile(r'^[\w._~]+$') COUNTRY_CODE_REGEX = re.compile(r'^[A-Z]{2}$') # ISO 3166-2 subdivision codes are 1-3 uppercase alphanumeric characters, e.g. 'CA', 'NSW', '9' (Wien, AT-9) diff --git a/src/apify/events/_apify_event_manager.py b/src/apify/events/_apify_event_manager.py index c4bf6e6df..e7b21da8a 100644 --- a/src/apify/events/_apify_event_manager.py +++ b/src/apify/events/_apify_event_manager.py @@ -3,6 +3,7 @@ import asyncio import contextlib import time +from logging import getLogger from typing import TYPE_CHECKING, Annotated, Self import websockets.asyncio.client @@ -17,7 +18,6 @@ from apify._utils import docs_group from apify.events._types import DeprecatedEvent, EventMessage, SystemInfoEventData, UnknownEvent -from apify.log import logger if TYPE_CHECKING: from collections.abc import Generator @@ -28,6 +28,8 @@ from apify._configuration import Configuration +logger = getLogger(__name__) + event_data_adapter = TypeAdapter[EventMessage | DeprecatedEvent | UnknownEvent]( Annotated[EventMessage, Discriminator('name')] | DeprecatedEvent | UnknownEvent ) @@ -195,7 +197,7 @@ async def _handle_platform_message(self, message: str | bytes) -> None: return if isinstance(parsed_message, UnknownEvent): - logger.info( + logger.debug( f'Unknown message received: event_name={parsed_message.name}, event_data={parsed_message.data}' ) return diff --git a/src/apify/scrapy/middlewares/apify_proxy.py b/src/apify/scrapy/middlewares/apify_proxy.py index de061f202..7665bfcfa 100644 --- a/src/apify/scrapy/middlewares/apify_proxy.py +++ b/src/apify/scrapy/middlewares/apify_proxy.py @@ -72,7 +72,6 @@ async def process_request(self, request: Request) -> None: Raises: ValueError: If username and password are not provided in the proxy URL. """ - Actor.log.debug(f'ApifyHttpProxyMiddleware.process_request: request={request}') url = await self._get_new_proxy_url() if not (url.username and url.password): @@ -82,8 +81,6 @@ async def process_request(self, request: Request) -> None: basic_auth_header = get_basic_auth_header(url.username, url.password) request.headers[b'Proxy-Authorization'] = basic_auth_header - Actor.log.debug(f'ApifyHttpProxyMiddleware.process_request: updated request.meta={request.meta}') - def process_exception( self, request: Request, @@ -100,10 +97,6 @@ def process_exception( process_exception() methods of installed middleware, until no middleware is left and the default exception handling kicks in. """ - Actor.log.debug( - f'ApifyHttpProxyMiddleware.process_exception: request={request}, exception={exception}', - ) - if isinstance(exception, TunnelError): Actor.log.warning( f'ApifyHttpProxyMiddleware: TunnelError occurred for request="{request}", ' diff --git a/src/apify/scrapy/pipelines/actor_dataset_push.py b/src/apify/scrapy/pipelines/actor_dataset_push.py index b3a801874..0f38d4bc9 100644 --- a/src/apify/scrapy/pipelines/actor_dataset_push.py +++ b/src/apify/scrapy/pipelines/actor_dataset_push.py @@ -1,6 +1,5 @@ from __future__ import annotations -from logging import getLogger from typing import TYPE_CHECKING from itemadapter.adapter import ItemAdapter @@ -10,8 +9,6 @@ if TYPE_CHECKING: from scrapy import Item -logger = getLogger(__name__) - class ActorDatasetPushPipeline: """A Scrapy pipeline for pushing items to an Actor's default dataset. @@ -25,6 +22,5 @@ async def process_item( ) -> Item: """Pushes the provided Scrapy item to the Actor's default dataset.""" item_dict = ItemAdapter(item).asdict() - logger.debug(f'Pushing item={item_dict} to the dataset.') await Actor.push_data(item_dict) return item diff --git a/src/apify/scrapy/requests.py b/src/apify/scrapy/requests.py index 5a115cf66..8bf99ec52 100644 --- a/src/apify/scrapy/requests.py +++ b/src/apify/scrapy/requests.py @@ -61,8 +61,6 @@ def to_apify_request(scrapy_request: ScrapyRequest, spider: Spider) -> ApifyRequ logger.warning('Failed to convert to Apify request: Scrapy request must be a ScrapyRequest instance.') return None - logger.debug(f'to_apify_request was called (scrapy_request={scrapy_request})...') - # Configuration to behave as similarly as possible to Scrapy's default RFPDupeFilter. # # The body is stored twice on purpose: as `payload` (used for the extended unique key) and inside the serialized @@ -150,7 +148,6 @@ def to_apify_request(scrapy_request: ScrapyRequest, spider: Spider) -> ApifyRequ # a second base64 layer would only add ~33% overhead on the enqueue path. apify_request.user_data['scrapy_request'] = scrapy_request_json - logger.debug(f'scrapy_request was converted to the apify_request={apify_request}') return apify_request @@ -171,14 +168,10 @@ def to_scrapy_request(apify_request: ApifyRequest, spider: Spider) -> ScrapyRequ if not isinstance(cast('Any', apify_request), ApifyRequest): raise TypeError('apify_request must be an apify.Request instance') - logger.debug(f'to_scrapy_request was called (apify_request={apify_request})...') - # If the apify_request comes from the Scrapy if 'scrapy_request' in apify_request.user_data: # Deserialize the Scrapy ScrapyRequest from the apify_request by parsing the stored JSON and reconstructing # the Scrapy ScrapyRequest object from its dictionary representation. - logger.debug('Restoring the Scrapy ScrapyRequest from the apify_request...') - scrapy_request_json = apify_request.user_data['scrapy_request'] if not isinstance(scrapy_request_json, str): raise TypeError('the stored scrapy_request must be a string') @@ -194,8 +187,6 @@ def to_scrapy_request(apify_request: ApifyRequest, spider: Spider) -> ScrapyRequ if not isinstance(scrapy_request, ScrapyRequest): raise TypeError('scrapy_request must be an instance of the ScrapyRequest class') - logger.debug(f'Scrapy ScrapyRequest successfully reconstructed (scrapy_request={scrapy_request})...') - # Update the meta field with the meta field from the apify_request meta = scrapy_request.meta or {} meta.update({'apify_request_unique_key': apify_request.unique_key}) @@ -204,8 +195,6 @@ def to_scrapy_request(apify_request: ApifyRequest, spider: Spider) -> ScrapyRequ # If the apify_request comes directly from the Scrapy, typically start URLs. else: - logger.debug('Gonna create a new Scrapy ScrapyRequest (cannot be restored)') - scrapy_request = ScrapyRequest( url=apify_request.url, method=apify_request.method, @@ -222,5 +211,4 @@ def to_scrapy_request(apify_request: ApifyRequest, spider: Spider) -> ScrapyRequ if apify_request.user_data: scrapy_request.meta['userData'] = apify_request.user_data - logger.debug(f'an apify_request was converted to the scrapy_request={scrapy_request}') return scrapy_request diff --git a/src/apify/scrapy/scheduler.py b/src/apify/scrapy/scheduler.py index 0df47d503..0646d3d6f 100644 --- a/src/apify/scrapy/scheduler.py +++ b/src/apify/scrapy/scheduler.py @@ -134,17 +134,14 @@ def enqueue_request(self, request: Request) -> bool: Returns: True if the request was successfully enqueued, False otherwise. """ - logger.debug(f'ApifyScheduler.enqueue_request was called (scrapy_request={request})...') - if not isinstance(self.spider, Spider): raise TypeError('self.spider must be an instance of the Spider class') apify_request = to_apify_request(request, spider=self.spider) if apify_request is None: - logger.error(f'Request {request} could not be converted to Apify request.') + logger.warning(f'Request {request} could not be converted to Apify request; skipping it.') return False - logger.debug(f'Converted to apify_request: {apify_request}') if not isinstance(self._rq, RequestQueue): raise TypeError('self._rq must be an instance of the RequestQueue class') @@ -156,7 +153,6 @@ def enqueue_request(self, request: Request) -> bool: logger.exception('Failed to enqueue the request to the request queue.') raise - logger.debug(f'rq.add_request result: {result}') return not bool(result.was_already_present) def next_request(self) -> Request | None: @@ -165,7 +161,6 @@ def next_request(self) -> Request | None: Returns: The next request, or None if there are no more requests. """ - logger.debug('next_request called...') if not isinstance(self._rq, RequestQueue): raise TypeError('self._rq must be an instance of the RequestQueue class') @@ -177,7 +172,6 @@ def next_request(self) -> Request | None: logger.exception('Failed to fetch the next request from the request queue.') raise - logger.debug(f'Fetched apify_request: {apify_request}') if apify_request is None: return None @@ -206,5 +200,4 @@ def next_request(self) -> Request | None: if scrapy_request is None: return None - logger.debug(f'Converted to scrapy_request: {scrapy_request}') return scrapy_request diff --git a/src/apify/storage_clients/_apify/_alias_resolving.py b/src/apify/storage_clients/_apify/_alias_resolving.py index 56e13ebbd..39fa3b995 100644 --- a/src/apify/storage_clients/_apify/_alias_resolving.py +++ b/src/apify/storage_clients/_apify/_alias_resolving.py @@ -1,6 +1,5 @@ from __future__ import annotations -import logging from asyncio import Lock from functools import cached_property from logging import getLogger @@ -225,7 +224,7 @@ async def store_mapping(self, storage_id: str) -> None: alias_map[self._storage_key] = storage_id if not self._configuration.is_at_home: - logging.getLogger(__name__).debug( + logger.debug( 'AliasResolver storage limited retention is only supported on Apify platform. Storage is not exported.' ) return diff --git a/src/apify/storage_clients/_file_system/_key_value_store_client.py b/src/apify/storage_clients/_file_system/_key_value_store_client.py index 09d7194ab..bfce86a74 100644 --- a/src/apify/storage_clients/_file_system/_key_value_store_client.py +++ b/src/apify/storage_clients/_file_system/_key_value_store_client.py @@ -122,7 +122,7 @@ async def _create_missing_metadata_for_input_file(key: str, record_path: Path) - try: content = await asyncio.to_thread(record_path.read_bytes) except FileNotFoundError: - logger.warning(f'Input file disparaged on path: "{record_path}"') + logger.warning(f'Input file disappeared from path: "{record_path}"') return # Figure out the metadata from the file content diff --git a/tests/unit/actor/test_actor_helpers.py b/tests/unit/actor/test_actor_helpers.py index 5d943904e..78c567c6b 100644 --- a/tests/unit/actor/test_actor_helpers.py +++ b/tests/unit/actor/test_actor_helpers.py @@ -138,7 +138,7 @@ async def test_metamorph_fails_locally(caplog: pytest.LogCaptureFixture) -> None matching = [r for r in caplog.records if 'Actor.metamorph()' in r.message] assert len(matching) == 1 - assert matching[0].levelname == 'ERROR' + assert matching[0].levelname == 'WARNING' assert 'only supported when running on the Apify platform' in matching[0].message @@ -149,7 +149,7 @@ async def test_reboot_fails_locally(caplog: pytest.LogCaptureFixture) -> None: matching = [r for r in caplog.records if 'Actor.reboot()' in r.message] assert len(matching) == 1 - assert matching[0].levelname == 'ERROR' + assert matching[0].levelname == 'WARNING' assert 'only supported when running on the Apify platform' in matching[0].message @@ -165,7 +165,7 @@ async def test_add_webhook_fails_locally(caplog: pytest.LogCaptureFixture) -> No matching = [r for r in caplog.records if 'Actor.add_webhook()' in r.message] assert len(matching) == 1 - assert matching[0].levelname == 'ERROR' + assert matching[0].levelname == 'WARNING' assert 'only supported when running on the Apify platform' in matching[0].message diff --git a/tests/unit/actor/test_actor_lifecycle.py b/tests/unit/actor/test_actor_lifecycle.py index 0575a211b..d1bcdcbf6 100644 --- a/tests/unit/actor/test_actor_lifecycle.py +++ b/tests/unit/actor/test_actor_lifecycle.py @@ -164,14 +164,14 @@ async def test_fail_without_init_raises_runtime_error() -> None: await Actor().fail() -async def test_reboot_in_local_environment_logs_error_message( +async def test_reboot_in_local_environment_logs_warning_message( actor: _ActorType, caplog: pytest.LogCaptureFixture, ) -> None: - """Test that reboot() logs an error when not running on the Apify platform.""" + """Test that reboot() logs a warning when not running on the Apify platform.""" await actor.reboot() - # Check that the error message was logged + # Check that the warning message was logged assert 'Actor.reboot() is only supported when running on the Apify platform.' in caplog.text diff --git a/tests/unit/events/test_apify_event_manager.py b/tests/unit/events/test_apify_event_manager.py index 18928614c..8f7239668 100644 --- a/tests/unit/events/test_apify_event_manager.py +++ b/tests/unit/events/test_apify_event_manager.py @@ -355,7 +355,7 @@ def listener(data: Any) -> None: async def test_unknown_event_is_logged(monkeypatch: pytest.MonkeyPatch, caplog: pytest.LogCaptureFixture) -> None: """Test that unknown events are logged and not emitted.""" - caplog.set_level(logging.INFO, logger='apify') + caplog.set_level(logging.DEBUG, logger='apify') async with ( _platform_ws_server(monkeypatch) as (connected_ws_clients, client_connected), ApifyEventManager(Configuration.get_global_configuration()), diff --git a/tests/unit/scrapy/test_scheduler.py b/tests/unit/scrapy/test_scheduler.py index cfe5b48af..a7cc4445d 100644 --- a/tests/unit/scrapy/test_scheduler.py +++ b/tests/unit/scrapy/test_scheduler.py @@ -56,13 +56,13 @@ def test_enqueue_request_skips_non_serializable_request( scheduler: ApifyScheduler, caplog: pytest.LogCaptureFixture, ) -> None: - """A request that cannot be converted (non-serializable meta) is not enqueued: returns False and logs an error.""" + """A request that cannot be converted (non-serializable meta) is not enqueued: returns False and logs a warning.""" rq = cast('mock.MagicMock', scheduler._rq) # A set in `meta` is not JSON-serializable, so `to_apify_request` returns None. scrapy_request = Request(url='https://example.com', meta={'tags': {'a', 'b'}}) - with caplog.at_level(logging.ERROR, logger='apify.scrapy.scheduler'): + with caplog.at_level(logging.WARNING, logger='apify.scrapy.scheduler'): result = scheduler.enqueue_request(scrapy_request) assert result is False