Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 48 additions & 0 deletions AGENTS.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
# AGENTS.md

## Cursor Cloud specific instructions

### Environment overview

This is **NVIDIA TensorRT-LLM** (`tensorrt_llm`), a C++/Python library for optimized LLM inference on NVIDIA GPUs. The codebase has two major layers:

- **C++ runtime** (`cpp/`) — kernels, batch manager, executor; compiled into `libtensorrt_llm.so` and Python bindings (`tensorrt_llm/bindings/`)
- **Python package** (`tensorrt_llm/`) — model definitions, LLM API, serving layer, quantization utilities

### GPU constraint

The Cloud Agent VM has **no NVIDIA GPU**. Inference, model building, and serving commands (`trtllm-serve`, `trtllm-build`, `trtllm-bench`) cannot run.

A Python-only stub for `tensorrt_llm/bindings/` is provided so that `import tensorrt_llm` works on CPU. Set `TRT_LLM_NO_LIB_INIT=1` to skip loading the compiled plugin `.so` files (which don't exist without a C++ build).

### Running unit tests without GPU

```bash
TRT_LLM_NO_LIB_INIT=1 python3 -m pytest \
tests/unittest/llmapi/test_reasoning_parser.py \
tests/unittest/llmapi/test_build_cache.py \
tests/unittest/others/test_mapping.py \
tests/unittest/trt/quantization/test_mode.py \
tests/unittest/others/test_kv_cache_manager.py \
tests/unittest/others/test_module.py \
-v
```

Test modules that import `tests/unittest/utils/util.py` cannot run without a GPU, because that helper calls `cuda.cuInit()` at import time.

### Lint/format commands

```bash
pre-commit run --all-files # All 21 hooks
ruff check # Lint (auto_deploy + progressively enabled files)
ruff format --check # Format check
isort --check-only tensorrt_llm/ # Import sorting check
```

### Key project conventions

- Python formatting: **yapf** (pep8, 80 cols) for most files; **ruff** (100 cols) for `auto_deploy/` and progressively enabled files. See `pyproject.toml`.
- C++ code: **clang-format** (v16+).
- Commits must include DCO sign-off (`git commit -s`).
- PR titles follow [Conventional Commits](https://www.conventionalcommits.org/en/v1.0.0/).
- Config: `.pre-commit-config.yaml` (hooks), `pyproject.toml` (linting).
6 changes: 5 additions & 1 deletion scripts/build_wheel.py
Original file line number Diff line number Diff line change
Expand Up @@ -614,7 +614,11 @@ def get_pybind_lib():
) == 1, f"Exactly one pybind library should be present: {pybind_lib}"
return pybind_lib[0]

install_file(get_pybind_lib(), pkg_dir)
pybind_lib = get_pybind_lib()
pybind_dst_name = pybind_lib.name.replace("bindings.", "_C.", 1)
bindings_pkg_dir = pkg_dir / "bindings"
bindings_pkg_dir.mkdir(parents=True, exist_ok=True)
install_file(pybind_lib, bindings_pkg_dir / pybind_dst_name)
if not skip_stubs:
with working_directory(project_dir):
build_run(f"\"{venv_python}\" -m pip install pybind11-stubgen")
Expand Down
24 changes: 21 additions & 3 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,15 +96,16 @@ def has_ext_modules(self):
if on_windows:
package_data = [
'libs/th_common.dll', 'libs/tensorrt_llm.dll',
'libs/nvinfer_plugin_tensorrt_llm.dll', 'bindings.*.pyd', "include/**/*"
'libs/nvinfer_plugin_tensorrt_llm.dll', 'bindings/_C.*.pyd',
"include/**/*"
]
else:
package_data = [
'bin/executorWorker', 'libs/libtensorrt_llm.so', 'libs/libth_common.so',
'libs/libnvinfer_plugin_tensorrt_llm.so',
'libs/libtensorrt_llm_ucx_wrapper.so', 'libs/libdecoder_attention_0.so',
'libs/libtensorrt_llm_nixl_wrapper.so',
'libs/libdecoder_attention_1.so', 'bindings.*.so', "include/**/*"
'libs/libdecoder_attention_1.so', 'bindings/_C.*.so', "include/**/*"
]

package_data += [
Expand Down Expand Up @@ -184,17 +185,34 @@ def extract_from_precompiled(precompiled_location: str, package_data: List[str],
for file in wheel.filelist:
if file.filename.endswith(".py"):
continue
# Match against current package_data patterns
matched = False
for filename_pattern in package_data:
if fnmatch.fnmatchcase(file.filename,
f"tensorrt_llm/{filename_pattern}"):
matched = True
break
else:
# Also accept legacy top-level bindings .so and relocate it
if not matched and fnmatch.fnmatchcase(file.filename,
"tensorrt_llm/bindings.*"):
matched = True
if not matched:
continue
print(
f"Extracting and including {file.filename} from precompiled wheel."
)
wheel.extract(file)

# Relocate legacy top-level bindings .so into the bindings/ package
import glob as _glob
for legacy_so in _glob.glob("tensorrt_llm/bindings.*.so") + _glob.glob(
"tensorrt_llm/bindings.*.pyd"):
new_name = os.path.basename(legacy_so).replace("bindings.", "_C.", 1)
dest = os.path.join("tensorrt_llm", "bindings", new_name)
os.makedirs(os.path.dirname(dest), exist_ok=True)
os.rename(legacy_so, dest)
print(f"Relocated {legacy_so} -> {dest}")


use_precompiled: bool = os.getenv("TRTLLM_USE_PRECOMPILED") == "1"
precompiled_location: str = os.getenv("TRTLLM_PRECOMPILED_LOCATION")
Expand Down
34 changes: 17 additions & 17 deletions tensorrt_llm/_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,30 +59,30 @@ def _init(log_level: object = None) -> None:

logger.info("Starting TensorRT-LLM init.")

# load plugin lib
_load_plugin_lib()

# load FT decoder layer and torch custom ops
project_dir = str(Path(__file__).parent.absolute())
if platform.system() == "Windows":
ft_decoder_lib = project_dir + "/libs/th_common.dll"
else:
ft_decoder_lib = project_dir + "/libs/libth_common.so"
try:
# load plugin lib
_load_plugin_lib()

# load FT decoder layer and torch custom ops
project_dir = str(Path(__file__).parent.absolute())
if platform.system() == "Windows":
ft_decoder_lib = project_dir + "/libs/th_common.dll"
else:
ft_decoder_lib = project_dir + "/libs/libth_common.so"
torch.classes.load_library(ft_decoder_lib)
from ._torch.custom_ops import _register_fake

_register_fake()
except Exception as e:
msg = (
"\nFATAL: Decoding operators failed to load. This may be caused by an incompatibility "
"between PyTorch and TensorRT-LLM. Please rebuild and install TensorRT-LLM."
)
raise ImportError(str(e) + msg)

MpiComm.local_init()
MpiComm.local_init()

logger.info("TensorRT-LLM inited.")
logger.info("TensorRT-LLM inited.")
except OSError as e:
logger.warning(
f"Could not load TensorRT-LLM C++ libraries: {e}. "
"This is expected on CPU-only machines without an NVIDIA driver. "
"GPU-dependent features will not be available."
)


def default_net() -> Network:
Expand Down
3 changes: 3 additions & 0 deletions tensorrt_llm/bindings/BuildInfo.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
"""CPU-only stub for tensorrt_llm.bindings.BuildInfo."""

# Value exposed by the CPU-only stub of this module.
# NOTE(review): presumably mirrors a build-time flag of the compiled
# bindings — confirm that ``False`` is the right default for stub mode.
ENABLE_MULTI_DEVICE = False
21 changes: 21 additions & 0 deletions tensorrt_llm/bindings/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
"""Python bindings for TensorRT-LLM's C++ runtime.

When the compiled pybind11 extension (``_C``) is available and loadable, every
symbol is re-exported from it and this package behaves identically to the
monolithic ``bindings.cpython-*.so`` that older builds produced.

When the extension cannot be loaded (no GPU driver, no compiled build, etc.)
a lightweight set of Python-only stubs is activated instead so that the rest
of the ``tensorrt_llm`` package can still be imported for linting, testing
pure-Python logic, and similar CPU-only workflows.
"""

try:
    from ._C import *  # noqa: F401, F403
except ImportError as _exc:
    # The compiled extension is missing or could not be loaded: install the
    # module-level ``__getattr__`` hook so attribute access yields stubs.
    from ._stubs import make_module_getattr as _make_module_getattr

    __getattr__ = _make_module_getattr(__name__)
    _USING_STUBS = True
    # Keep the reason for the fallback. Without it, a broken or incompatible
    # compiled build is indistinguishable from a deliberate CPU-only setup.
    _IMPORT_ERROR = _exc
else:
    _USING_STUBS = False
    _IMPORT_ERROR = None
    # NOTE(review): the stub modules that live next to this file (e.g.
    # ``executor.py``, ``internal/``) are regular importable files, so
    # ``import tensorrt_llm.bindings.executor`` presumably resolves to them
    # even when ``_C`` loaded — confirm the compiled submodules stay reachable.
56 changes: 56 additions & 0 deletions tensorrt_llm/bindings/_stubs.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
"""Generic stub machinery for CPU-only bindings.

Every stub module in this package delegates to the same small set of
primitives defined here, so there is exactly one place to maintain.

* ``_StubMeta`` – metaclass whose ``__getattr__`` auto-vivifies new
stub classes for any attribute access (``SomeStub.Foo`` → new class).
* ``_Stub`` – base class that accepts arbitrary ``*args / **kwargs``.
* ``make_module_getattr`` – returns a module-level ``__getattr__`` that
creates a fresh ``_Stub`` subclass for every unknown name.
"""


class _StubMeta(type):
    """Metaclass that gives stub classes auto-vivifying class attributes.

    Looking up a missing public attribute on a stub class creates a new stub
    class with that name, stores it on the parent and returns it, so later
    lookups return the very same object.

    ``__members__`` (exposed by pybind11 enums) is answered with an empty
    dict so that decorators like ``mirror_pybind_enum`` find no fields to
    check. Every other underscore-prefixed name raises ``AttributeError``.
    """

    def __getattr__(cls, name):
        # Only invoked after regular attribute lookup on the class failed.
        if not name.startswith("_"):
            nested = _StubMeta(name, (_Stub, ), {})
            # Cache on the parent so this hook is not hit again for ``name``.
            setattr(cls, name, nested)
            return nested
        if name == "__members__":
            return {}
        raise AttributeError(name)


class _Stub(metaclass=_StubMeta):
    """Placeholder type used in place of a compiled binding class.

    Positional constructor arguments are accepted and ignored; every keyword
    argument is set as an attribute of the same name on the new instance.
    """

    def __init__(self, *args, **kwargs):
        for attr_name in kwargs:
            setattr(self, attr_name, kwargs[attr_name])

    def __init_subclass__(cls, **options):
        # Plain pass-through to the default hook; adds no behaviour.
        super().__init_subclass__(**options)


def make_module_getattr(module_path):
    """Return a module-level ``__getattr__`` hook for a stub module.

    Args:
        module_path: Dotted name of the module the hook is installed in
            (callers pass ``__name__``). It is used as ``__module__`` of the
            generated stub classes and in ``AttributeError`` messages.

    Returns:
        A function ``__getattr__(name)``. For a public ``name`` it returns a
        ``_Stub`` subclass named ``name``; the class is cached, so repeated
        access returns the same class object (important for identity checks
        and dict keys). For a ``name`` starting with an underscore it raises
        ``AttributeError``, matching ``_StubMeta.__getattr__``.
    """
    cache = {}

    def __getattr__(name):
        # The import machinery and introspection tools probe modules for
        # optional attributes such as ``__all__``, ``__path__`` or
        # ``__wrapped__``. Answering those with a fabricated class would make
        # ``hasattr`` checks succeed and break ``from module import *``.
        if name.startswith("_"):
            raise AttributeError(
                f"module {module_path!r} has no attribute {name!r}")
        try:
            return cache[name]
        except KeyError:
            stub_cls = _StubMeta(name, (_Stub, ), {"__module__": module_path})
            cache[name] = stub_cls
            return stub_cls

    return __getattr__
26 changes: 26 additions & 0 deletions tensorrt_llm/bindings/executor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
"""CPU-only stub for tensorrt_llm.bindings.executor.

All types are auto-generated via ``_stubs.make_module_getattr`` except the
one class that is called at *import* time with specific static methods.
"""

from ._stubs import _Stub, make_module_getattr


class LookaheadDecodingConfig(_Stub):
    """Stub exposing the lookahead-decoding defaults through static getters.

    Written out explicitly instead of being auto-generated because
    ``llm_args.py`` reads these defaults at class-definition time.
    """

    # NOTE(review): presumably these match the defaults of the compiled
    # bindings — confirm whenever the C++ side changes.
    _DEFAULT_WINDOW = 4
    _DEFAULT_NGRAM = 3
    _DEFAULT_VERIFICATION_SET = 4

    @staticmethod
    def get_default_lookahead_decoding_window():
        return LookaheadDecodingConfig._DEFAULT_WINDOW

    @staticmethod
    def get_default_lookahead_decoding_ngram():
        return LookaheadDecodingConfig._DEFAULT_NGRAM

    @staticmethod
    def get_default_lookahead_decoding_verification_set():
        return LookaheadDecodingConfig._DEFAULT_VERIFICATION_SET


# Module-level ``__getattr__`` hook: names not defined above are resolved by
# the shared stub factory from ``_stubs``.
__getattr__ = make_module_getattr(__name__)
5 changes: 5 additions & 0 deletions tensorrt_llm/bindings/internal/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
"""CPU-only stub for tensorrt_llm.bindings.internal."""

from .._stubs import make_module_getattr

# Module-level ``__getattr__`` hook: attribute lookups on this package that
# find nothing are resolved by the shared stub factory from ``_stubs``.
__getattr__ = make_module_getattr(__name__)
5 changes: 5 additions & 0 deletions tensorrt_llm/bindings/internal/runtime.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
"""CPU-only stub for tensorrt_llm.bindings.internal.runtime."""

from .._stubs import make_module_getattr

# Module-level ``__getattr__`` hook: attribute lookups on this module that
# find nothing are resolved by the shared stub factory from ``_stubs``.
__getattr__ = make_module_getattr(__name__)
5 changes: 5 additions & 0 deletions tensorrt_llm/bindings/internal/userbuffers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
"""CPU-only stub for tensorrt_llm.bindings.internal.userbuffers."""

from .._stubs import make_module_getattr

# Module-level ``__getattr__`` hook: attribute lookups on this module that
# find nothing are resolved by the shared stub factory from ``_stubs``.
__getattr__ = make_module_getattr(__name__)
14 changes: 9 additions & 5 deletions tensorrt_llm/profiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,12 +120,16 @@ def __exit__(self, type, value, traceback):
pynvml.nvmlShutdown()


_device_get_memory_info_fn = None
if pynvml is not None:
with PyNVMLContext():
_device_get_memory_info_fn = partial(
pynvml.nvmlDeviceGetMemoryInfo,
version=pynvml.nvmlMemory_v2,
)
try:
with PyNVMLContext():
_device_get_memory_info_fn = partial(
pynvml.nvmlDeviceGetMemoryInfo,
version=pynvml.nvmlMemory_v2,
)
except pynvml.NVMLError:
pynvml = None


def host_memory_info(pid: Optional[int] = None) -> Tuple[int, int, int]:
Expand Down
2 changes: 2 additions & 0 deletions tests/unittest/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,8 @@ def pytest_runtest_protocol(item, nextitem):
import os

import torch
if not torch.cuda.is_available():
break
worker_count = int(os.environ.get('PYTEST_XDIST_WORKER_COUNT', 1))

if (torch.cuda.memory_reserved(0) + torch.cuda.memory_allocated(0)
Expand Down
Loading