diff --git a/README.md b/README.md index c937ebc..464c401 100644 --- a/README.md +++ b/README.md @@ -83,7 +83,7 @@ pdf_parser = PdfParser() env definitions: ```bash -# provider: auto | local | modelscope +# provider: auto | local | huggingface | modelscope export DEEPDOC_MODEL_PROVIDER=auto # shared model cache root (default: ~/.cache/deepdoc) @@ -93,7 +93,13 @@ export DEEPDOC_MODEL_HOME=/path/to/deepdoc-models export DEEPDOC_VISION_MODEL_DIR=/path/to/vision export DEEPDOC_XGB_MODEL_DIR=/path/to/xgb -# single combined ModelScope repo (all bundles in one repo) +# single combined Hugging Face repo (all bundles in one repo) +# (default: Xorbits/deepdoc) +export DEEPDOC_HUGGINGFACE_REPO=Xorbits/deepdoc +# optional shared revision (default: main) +export DEEPDOC_HUGGINGFACE_REVISION=main + +# optional single combined ModelScope repo (all bundles in one repo) # (default: Xorbits/deepdoc) export DEEPDOC_MODELSCOPE_REPO=Xorbits/deepdoc # optional shared revision (default: master) @@ -116,6 +122,12 @@ deepdoc-download-models python -m deepdoc.download_models ``` +This uses Hugging Face by default. To force ModelScope instead: + +```bash +deepdoc-download-models --provider modelscope +``` + If you want to override the cache location, set `DEEPDOC_MODEL_HOME`: ```bash diff --git a/deepdoc/common/model_store.py b/deepdoc/common/model_store.py index e8f6589..56acdc7 100644 --- a/deepdoc/common/model_store.py +++ b/deepdoc/common/model_store.py @@ -14,7 +14,7 @@ # limitations under the License. # -"""Model artifact resolution for local files or ModelScope downloads.""" +"""Model artifact resolution for local files or remote model downloads.""" from __future__ import annotations @@ -29,20 +29,23 @@ GLOBAL_MODELSCOPE_REPO_ENV = "DEEPDOC_MODELSCOPE_REPO" GLOBAL_MODELSCOPE_REVISION_ENV = "DEEPDOC_MODELSCOPE_REVISION" +GLOBAL_HUGGINGFACE_REPO_ENV = "DEEPDOC_HUGGINGFACE_REPO" +GLOBAL_HUGGINGFACE_REVISION_ENV = "DEEPDOC_HUGGINGFACE_REVISION" TOKENIZER_MODEL_DIR_ENV = "DEEPDOC_TOKENIZER_MODEL_DIR" def _normalize_provider(provider: str | None) -> str: normalized = (provider or os.getenv("DEEPDOC_MODEL_PROVIDER", "auto")).strip().lower() aliases = { + "hf": "huggingface", "ms": "modelscope", - "remote": "modelscope", + "remote": "huggingface", "filesystem": "local", "user": "local", } normalized = aliases.get(normalized, normalized) - if normalized not in {"auto", "local", "modelscope"}: - raise ValueError("Unsupported model provider '{}'. Use one of: auto, local, modelscope.".format(normalized)) + if normalized not in {"auto", "local", "huggingface", "modelscope"}: + raise ValueError("Unsupported model provider '{}'. Use one of: auto, local, huggingface, modelscope.".format(normalized)) return normalized @@ -71,10 +74,30 @@ class BundleSpec: subdir: str required_files: tuple[str, ...] local_dir_env: str - repo_env: str - repo_default: str - revision_env: str - revision_default: str = "master" + modelscope_repo_env: str + modelscope_repo_default: str + modelscope_revision_env: str + modelscope_revision_default: str = "master" + huggingface_repo_env: str = "" + huggingface_repo_default: str = "Xorbits/deepdoc" + huggingface_revision_env: str = "" + huggingface_revision_default: str = "main" + + @property + def repo_env(self) -> str: + return self.modelscope_repo_env + + @property + def repo_default(self) -> str: + return self.modelscope_repo_default + + @property + def revision_env(self) -> str: + return self.modelscope_revision_env + + @property + def revision_default(self) -> str: + return self.modelscope_revision_default BUNDLES: dict[str, BundleSpec] = { @@ -92,18 +115,24 @@ class BundleSpec: "tsr.onnx", ), local_dir_env="DEEPDOC_VISION_MODEL_DIR", - repo_env="DEEPDOC_MODELSCOPE_VISION_REPO", - repo_default="Xorbits/deepdoc", - revision_env="DEEPDOC_MODELSCOPE_VISION_REVISION", + modelscope_repo_env="DEEPDOC_MODELSCOPE_VISION_REPO", + modelscope_repo_default="Xorbits/deepdoc", + modelscope_revision_env="DEEPDOC_MODELSCOPE_VISION_REVISION", + huggingface_repo_env="DEEPDOC_HUGGINGFACE_VISION_REPO", + huggingface_repo_default="Xorbits/deepdoc", + huggingface_revision_env="DEEPDOC_HUGGINGFACE_VISION_REVISION", ), "xgb": BundleSpec( name="xgb", subdir="xgb", required_files=("updown_concat_xgb.model",), local_dir_env="DEEPDOC_XGB_MODEL_DIR", - repo_env="DEEPDOC_MODELSCOPE_XGB_REPO", - repo_default="Xorbits/deepdoc", - revision_env="DEEPDOC_MODELSCOPE_XGB_REVISION", + modelscope_repo_env="DEEPDOC_MODELSCOPE_XGB_REPO", + modelscope_repo_default="Xorbits/deepdoc", + modelscope_revision_env="DEEPDOC_MODELSCOPE_XGB_REVISION", + huggingface_repo_env="DEEPDOC_HUGGINGFACE_XGB_REPO", + huggingface_repo_default="Xorbits/deepdoc", + huggingface_revision_env="DEEPDOC_HUGGINGFACE_XGB_REVISION", ), } @@ -114,11 +143,11 @@ def _resolve_modelscope_repo_id(spec: BundleSpec) -> tuple[str, bool]: Precedence: 1) per-bundle env (e.g. DEEPDOC_MODELSCOPE_VISION_REPO) 2) shared env (DEEPDOC_MODELSCOPE_REPO) - 3) per-bundle default (spec.repo_default) + 3) per-bundle default (spec.modelscope_repo_default) Returns (repo_id, use_shared_download_dir). """ - explicit = os.getenv(spec.repo_env) + explicit = os.getenv(spec.modelscope_repo_env) if explicit and explicit.strip(): repo_id = explicit.strip() shared_repo = os.getenv(GLOBAL_MODELSCOPE_REPO_ENV) @@ -129,16 +158,16 @@ def _resolve_modelscope_repo_id(spec: BundleSpec) -> tuple[str, bool]: if shared_repo and shared_repo.strip(): return shared_repo.strip(), True - repo_id = spec.repo_default.strip() + repo_id = spec.modelscope_repo_default.strip() # If all bundle defaults point at the same repo, treat it as a combined repo and share the download directory. - default_repos = {bundle.repo_default.strip() for bundle in BUNDLES.values()} + default_repos = {bundle.modelscope_repo_default.strip() for bundle in BUNDLES.values()} use_shared_dir = len(default_repos) == 1 and repo_id in default_repos return repo_id, use_shared_dir def _resolve_modelscope_revision(spec: BundleSpec) -> str: """Resolve ModelScope revision with an optional shared default.""" - explicit = os.getenv(spec.revision_env) + explicit = os.getenv(spec.modelscope_revision_env) if explicit and explicit.strip(): return explicit.strip() @@ -146,7 +175,37 @@ def _resolve_modelscope_revision(spec: BundleSpec) -> str: if shared and shared.strip(): return shared.strip() - return spec.revision_default + return spec.modelscope_revision_default + + +def _resolve_huggingface_repo_id(spec: BundleSpec) -> tuple[str, bool]: + explicit = os.getenv(spec.huggingface_repo_env) + if explicit and explicit.strip(): + repo_id = explicit.strip() + shared_repo = os.getenv(GLOBAL_HUGGINGFACE_REPO_ENV) + use_shared_dir = bool(shared_repo and shared_repo.strip() and shared_repo.strip() == repo_id) + return repo_id, use_shared_dir + + shared_repo = os.getenv(GLOBAL_HUGGINGFACE_REPO_ENV) + if shared_repo and shared_repo.strip(): + return shared_repo.strip(), True + + repo_id = spec.huggingface_repo_default.strip() + default_repos = {bundle.huggingface_repo_default.strip() for bundle in BUNDLES.values()} + use_shared_dir = len(default_repos) == 1 and repo_id in default_repos + return repo_id, use_shared_dir + + +def _resolve_huggingface_revision(spec: BundleSpec) -> str: + explicit = os.getenv(spec.huggingface_revision_env) + if explicit and explicit.strip(): + return explicit.strip() + + shared = os.getenv(GLOBAL_HUGGINGFACE_REVISION_ENV) + if shared and shared.strip(): + return shared.strip() + + return spec.huggingface_revision_default def _slugify_repo_path(value: str) -> str: @@ -159,6 +218,11 @@ def _modelscope_shared_download_dir(model_home: str | None, repo_id: str, revisi return base.joinpath("modelscope", _slugify_repo_path(repo_id), _slugify_repo_path(revision)) +def _huggingface_shared_download_dir(model_home: str | None, repo_id: str, revision: str) -> Path: + base = _model_home_path(model_home) + return base.joinpath("huggingface", _slugify_repo_path(repo_id), _slugify_repo_path(revision)) + + def _validate_bundle_dir(spec: BundleSpec, base_dir: Path) -> tuple[bool, list[str]]: missing = [name for name in spec.required_files if not base_dir.joinpath(name).exists()] return not missing, missing @@ -196,6 +260,15 @@ def _import_modelscope_snapshot_download(): raise RuntimeError("ModelScope provider requires the 'modelscope' package. Install it or switch DEEPDOC_MODEL_PROVIDER=local.") from exc +def _import_huggingface_snapshot_download(): + try: + from huggingface_hub import snapshot_download # type: ignore + + return snapshot_download + except Exception as exc: # pragma: no cover - import behavior depends on runtime env + raise RuntimeError("Hugging Face provider requires the 'huggingface-hub' package. Install it or switch DEEPDOC_MODEL_PROVIDER=local.") from exc + + def _download_modelscope_repo(*, repo_id: str, revision: str, target_dir: Path, offline: bool) -> Path: snapshot_download = _import_modelscope_snapshot_download() @@ -238,6 +311,71 @@ def _download_modelscope_repo(*, repo_id: str, revision: str, target_dir: Path, return resolved_snapshot +def _download_huggingface_repo(*, repo_id: str, revision: str, target_dir: Path, offline: bool) -> Path: + snapshot_download = _import_huggingface_snapshot_download() + + if not repo_id: + raise RuntimeError( + "Hugging Face repo id is empty. Set {} or a bundle-specific env like {}.".format( + GLOBAL_HUGGINGFACE_REPO_ENV, + "DEEPDOC_HUGGINGFACE_VISION_REPO", + ) + ) + + target_dir.mkdir(parents=True, exist_ok=True) + + signature = inspect.signature(snapshot_download) + params = signature.parameters + kwargs: dict[str, object] = {"repo_id": repo_id} + + if "revision" in params: + kwargs["revision"] = revision + + if "cache_dir" in params: + kwargs["cache_dir"] = str(target_dir.parent) + + if "local_dir" in params: + kwargs["local_dir"] = str(target_dir) + + if "local_dir_use_symlinks" in params: + kwargs["local_dir_use_symlinks"] = False + + if "local_files_only" in params: + kwargs["local_files_only"] = offline + + snapshot_root = snapshot_download(**kwargs) + resolved_snapshot = Path(snapshot_root).expanduser().resolve() + logging.info("Downloaded Hugging Face repo %s@%s to %s", repo_id, revision, resolved_snapshot) + return resolved_snapshot + + +def _candidate_remote_providers(provider_name: str) -> list[str]: + if provider_name == "auto": + return ["huggingface", "modelscope"] + if provider_name in {"huggingface", "modelscope"}: + return [provider_name] + return [] + + +def _resolve_remote_location(provider_name: str, spec: BundleSpec, model_home: str | None, local_bundle_dir: Path) -> tuple[str, str, Path]: + if provider_name == "huggingface": + repo_id, use_shared_dir = _resolve_huggingface_repo_id(spec) + revision = _resolve_huggingface_revision(spec) + download_dir = _huggingface_shared_download_dir(model_home, repo_id, revision) if use_shared_dir else local_bundle_dir + return repo_id, revision, download_dir + + repo_id, use_shared_dir = _resolve_modelscope_repo_id(spec) + revision = _resolve_modelscope_revision(spec) + download_dir = _modelscope_shared_download_dir(model_home, repo_id, revision) if use_shared_dir else local_bundle_dir + return repo_id, revision, download_dir + + +def _download_remote_repo(provider_name: str, repo_id: str, revision: str, target_dir: Path, offline: bool) -> Path: + if provider_name == "huggingface": + return _download_huggingface_repo(repo_id=repo_id, revision=revision, target_dir=target_dir, offline=offline) + return _download_modelscope_repo(repo_id=repo_id, revision=revision, target_dir=target_dir, offline=offline) + + def resolve_bundle_dir( bundle: str, *, @@ -245,7 +383,7 @@ def resolve_bundle_dir( provider: str | None = None, offline: bool | None = None, ) -> str: - """Resolve a model bundle directory from local files or ModelScope.""" + """Resolve a model bundle directory from local files or remote providers.""" if bundle not in BUNDLES: raise ValueError(f"Unknown model bundle '{bundle}'. Expected one of: {', '.join(BUNDLES)}") @@ -262,18 +400,11 @@ def resolve_bundle_dir( roots_to_scan = [local_bundle_dir] - # Prefer reusing already-downloaded ModelScope artifacts from our stable - # `model_home/modelscope///...` location when using shared repos. - shared_download_dir: Path | None = None - shared_repo_id: str | None = None - shared_revision: str | None = None - use_shared_repo_dir = False - if provider_name in {"auto", "modelscope"} and not offline_mode: - shared_repo_id, use_shared_repo_dir = _resolve_modelscope_repo_id(spec) - if use_shared_repo_dir: - shared_revision = _resolve_modelscope_revision(spec) - shared_download_dir = _modelscope_shared_download_dir(model_home, shared_repo_id, shared_revision) - roots_to_scan.append(shared_download_dir) + if not offline_mode: + for remote_provider in _candidate_remote_providers(provider_name): + repo_id, revision, download_dir = _resolve_remote_location(remote_provider, spec, model_home, local_bundle_dir) + if download_dir not in roots_to_scan and download_dir != local_bundle_dir: + roots_to_scan.append(download_dir) discovered = _discover_bundle_dir(spec, roots_to_scan) if discovered: @@ -290,44 +421,37 @@ def resolve_bundle_dir( if provider_name == "local" or offline_mode: raise FileNotFoundError("Bundle '{}' was not found locally at {} and remote download is disabled. Disable DEEPDOC_OFFLINE or provide local model files.".format(spec.name, local_bundle_dir)) - repo_id = shared_repo_id - revision = shared_revision - if not repo_id or revision is None or not use_shared_repo_dir: - repo_id, use_shared_repo_dir = _resolve_modelscope_repo_id(spec) - revision = _resolve_modelscope_revision(spec) - download_dir = _modelscope_shared_download_dir(model_home, repo_id, revision) if use_shared_repo_dir else local_bundle_dir - else: - # We already computed the shared repo download dir above. - download_dir = shared_download_dir or _modelscope_shared_download_dir(model_home, repo_id, revision) - - snapshot_root = _download_modelscope_repo( - repo_id=repo_id, - revision=revision, - target_dir=download_dir, - offline=offline_mode, - ) - discovered = _discover_bundle_dir(spec, [local_bundle_dir, download_dir, snapshot_root]) - if discovered: - return str(discovered) + failures: list[str] = [] + for remote_provider in _candidate_remote_providers(provider_name): + repo_id, revision, download_dir = _resolve_remote_location(remote_provider, spec, model_home, local_bundle_dir) + try: + snapshot_root = _download_remote_repo( + remote_provider, + repo_id, + revision, + download_dir, + offline_mode, + ) + discovered = _discover_bundle_dir(spec, [local_bundle_dir, download_dir, snapshot_root]) + if discovered: + return str(discovered) + + failures.append( + "Downloaded {} repo '{}@{}' for bundle '{}' but could not locate the required files under '{}' or '{}'.".format( + "Hugging Face" if remote_provider == "huggingface" else "ModelScope", + repo_id, + revision, + spec.name, + download_dir, + snapshot_root, + ) + ) + except Exception as exc: + failures.append("{}: {}".format(remote_provider, exc)) + if provider_name != "auto": + raise - raise FileNotFoundError( - "Downloaded ModelScope repo '{}@{}' for bundle '{}' but could not locate the required files. " - "Expected the following files to be colocated under a single directory in the repo (e.g. '{}/'): {}. " - "Configured via {} / {} and {} / {}. " - "Searched under: {}, {}.".format( - repo_id, - revision, - spec.name, - spec.subdir, - ", ".join(spec.required_files), - spec.repo_env, - GLOBAL_MODELSCOPE_REPO_ENV, - spec.revision_env, - GLOBAL_MODELSCOPE_REVISION_ENV, - download_dir, - snapshot_root, - ) - ) + raise FileNotFoundError("Failed to resolve bundle '{}'. {}".format(spec.name, " ".join(failures))) def validate_bundle_dir(bundle: str, directory: str | Path) -> tuple[bool, list[str]]: diff --git a/deepdoc/config.py b/deepdoc/config.py index 1ba540f..cfecfc1 100644 --- a/deepdoc/config.py +++ b/deepdoc/config.py @@ -13,20 +13,21 @@ ) from .common.misc_utils import offline_mode_or_from_env -ProviderType = Literal["local", "modelscope", "auto"] +ProviderType = Literal["local", "huggingface", "modelscope", "auto"] def _normalize_provider(provider: str) -> ProviderType: normalized = provider.strip().lower() aliases = { + "hf": "huggingface", "ms": "modelscope", - "remote": "modelscope", + "remote": "huggingface", "filesystem": "local", "user": "local", } normalized = aliases.get(normalized, normalized) - if normalized not in {"local", "modelscope", "auto"}: - raise ValueError("Unsupported model provider '{}'. Use one of: local, modelscope, auto.".format(provider)) + if normalized not in {"local", "huggingface", "modelscope", "auto"}: + raise ValueError("Unsupported model provider '{}'. Use one of: local, huggingface, modelscope, auto.".format(provider)) return normalized # type: ignore[return-value] diff --git a/deepdoc/download_models.py b/deepdoc/download_models.py index c61d81e..a6ebe26 100644 --- a/deepdoc/download_models.py +++ b/deepdoc/download_models.py @@ -18,8 +18,8 @@ def _parse_args(argv: list[str]) -> argparse.Namespace: # (~/.cache/deepdoc unless DEEPDOC_MODEL_HOME is set). parser.add_argument( "--provider", - default="modelscope", - choices=("auto", "local", "modelscope"), + default="huggingface", + choices=("auto", "local", "huggingface", "modelscope"), help="Model provider to use (default: %(default)s).", ) parser.add_argument( @@ -72,7 +72,7 @@ def _parse_args(argv: list[str]) -> argparse.Namespace: return args -def download_all(*, provider: str = "modelscope", model_home: str | None = None, offline: bool = False) -> dict[str, str]: +def download_all(*, provider: str = "huggingface", model_home: str | None = None, offline: bool = False) -> dict[str, str]: """Download/cache all bundles into the configured cache directories.""" from deepdoc.common import model_store diff --git a/tests/test_model_store.py b/tests/test_model_store.py index 4e16733..05b8fca 100644 --- a/tests/test_model_store.py +++ b/tests/test_model_store.py @@ -112,6 +112,49 @@ def snapshot_download( self.assertEqual(calls[0]["repo"], "OtherOrg/vision-only") self.assertEqual(Path(calls[0]["local_dir"]).resolve(), (Path(tmp) / "vision").resolve()) + def test_resolve_from_huggingface_repo(self) -> None: + calls: list[dict[str, str | None | bool]] = [] + + def snapshot_download( + repo_id: str, + *, + revision: str | None = None, + cache_dir: str | None = None, + local_dir: str | None = None, + local_dir_use_symlinks: bool | None = None, + local_files_only: bool | None = None, + ) -> str: + calls.append( + { + "repo": repo_id, + "revision": revision, + "cache_dir": cache_dir, + "local_dir": local_dir, + "local_files_only": local_files_only, + } + ) + root = Path(local_dir) if local_dir else Path(cache_dir or ".") + root.mkdir(parents=True, exist_ok=True) + _create_combined_repo_layout(root) + return str(root) + + with tempfile.TemporaryDirectory() as tmp: + os.environ["DEEPDOC_HUGGINGFACE_REVISION"] = "main" + + with patch.object(ms, "_import_huggingface_snapshot_download", return_value=snapshot_download): + vision_dir = Path(ms.resolve_bundle_dir("vision", model_home=tmp, provider="huggingface", offline=False)) + xgb_dir = Path(ms.resolve_bundle_dir("xgb", model_home=tmp, provider="huggingface", offline=False)) + + expected_root = (Path(tmp) / "huggingface" / "Xorbits__deepdoc" / "main").resolve() + self.assertEqual(vision_dir.resolve(), (expected_root / "vision").resolve()) + self.assertEqual(xgb_dir.resolve(), (expected_root / "xgb").resolve()) + self.assertGreaterEqual(len(calls), 1) + for call in calls: + self.assertEqual(call["repo"], "Xorbits/deepdoc") + self.assertEqual(call["revision"], "main") + self.assertEqual(Path(call["local_dir"]).resolve(), expected_root) + self.assertFalse(call["local_files_only"]) + def test_auto_provider_discovers_pre_downloaded_shared_repo_without_downloading(self) -> None: """If models were previously downloaded into the shared repo dir, 'auto' should reuse them.""" @@ -132,6 +175,68 @@ def snapshot_download(*args, **kwargs) -> str: # pragma: no cover self.assertEqual(vision_dir.resolve(), (expected_root / "vision").resolve()) self.assertEqual(xgb_dir.resolve(), (expected_root / "xgb").resolve()) + def test_auto_provider_falls_back_to_modelscope_when_huggingface_download_fails(self) -> None: + hf_calls: list[dict[str, str | None]] = [] + ms_calls: list[dict[str, str | None]] = [] + + def hf_snapshot_download( + repo_id: str, + *, + revision: str | None = None, + cache_dir: str | None = None, + local_dir: str | None = None, + local_dir_use_symlinks: bool | None = None, + local_files_only: bool | None = None, + ) -> str: + hf_calls.append( + { + "repo": repo_id, + "revision": revision, + "cache_dir": cache_dir, + "local_dir": local_dir, + } + ) + raise RuntimeError("hf unavailable") + + def ms_snapshot_download( + model_id: str | None = None, + repo_id: str | None = None, + revision: str | None = None, + cache_dir: str | None = None, + local_dir: str | None = None, + local_dir_use_symlinks: bool | None = None, + local_files_only: bool | None = None, + ) -> str: + resolved_repo = model_id or repo_id + ms_calls.append( + { + "repo": resolved_repo, + "revision": revision, + "cache_dir": cache_dir, + "local_dir": local_dir, + } + ) + root = Path(local_dir) if local_dir else Path(cache_dir or ".") + root.mkdir(parents=True, exist_ok=True) + _create_combined_repo_layout(root) + return str(root) + + with tempfile.TemporaryDirectory() as tmp: + os.environ["DEEPDOC_HUGGINGFACE_REPO"] = "Xorbits/deepdoc" + os.environ["DEEPDOC_HUGGINGFACE_REVISION"] = "main" + os.environ[ms.GLOBAL_MODELSCOPE_REPO_ENV] = "Xorbits/deepdoc" + os.environ[ms.GLOBAL_MODELSCOPE_REVISION_ENV] = "v1" + + with ( + patch.object(ms, "_import_huggingface_snapshot_download", return_value=hf_snapshot_download), + patch.object(ms, "_import_modelscope_snapshot_download", return_value=ms_snapshot_download), + ): + vision_dir = Path(ms.resolve_bundle_dir("vision", model_home=tmp, provider="auto", offline=False)) + + self.assertEqual(len(hf_calls), 1) + self.assertEqual(len(ms_calls), 1) + self.assertEqual(Path(vision_dir).resolve(), (Path(tmp) / "modelscope" / "Xorbits__deepdoc" / "v1" / "vision").resolve()) + def test_resolve_tokenizer_dict_prefix_uses_packaged_dict_by_default(self) -> None: prefix = Path(ms.resolve_tokenizer_dict_prefix())