From 0b7f643d857c6ceca47ee5da9e5d72f5798b3ef7 Mon Sep 17 00:00:00 2001
From: Anton Volkov <antonwolfy@gmail.com>
Date: Wed, 14 Jan 2026 03:51:17 -0800
Subject: [PATCH 1/7] Use -fno-sycl-id-queries-fit-in-int for indexing
 extension

---
 dpnp/backend/extensions/indexing/CMakeLists.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/dpnp/backend/extensions/indexing/CMakeLists.txt b/dpnp/backend/extensions/indexing/CMakeLists.txt
index 78f6713bdf2..a6691f31f55 100644
--- a/dpnp/backend/extensions/indexing/CMakeLists.txt
+++ b/dpnp/backend/extensions/indexing/CMakeLists.txt
@@ -84,6 +84,7 @@ else()
     )
 endif()
 
+target_compile_options(${python_module_name} PUBLIC -fno-sycl-id-queries-fit-in-int)
 target_link_options(${python_module_name} PUBLIC -fsycl-device-code-split=per_kernel)
 
 if(DPNP_GENERATE_COVERAGE)

From 335c43418bc4ba73db3279632f8b161b4340faa3 Mon Sep 17 00:00:00 2001
From: Anton Volkov <antonwolfy@gmail.com>
Date: Wed, 14 Jan 2026 03:52:35 -0800
Subject: [PATCH 2/7] Update third party tests

---
 .../core_tests/test_ndarray_adv_indexing.py   | 67 ++++++++++++++++++-
 1 file changed, 66 insertions(+), 1 deletion(-)

diff --git a/dpnp/tests/third_party/cupy/core_tests/test_ndarray_adv_indexing.py b/dpnp/tests/third_party/cupy/core_tests/test_ndarray_adv_indexing.py
index 6a635ee53b2..f4917393cd2 100644
--- a/dpnp/tests/third_party/cupy/core_tests/test_ndarray_adv_indexing.py
+++ b/dpnp/tests/third_party/cupy/core_tests/test_ndarray_adv_indexing.py
@@ -1,3 +1,5 @@
+from __future__ import annotations
+
 import itertools
 
 import numpy
@@ -435,6 +437,15 @@ def test_invalid_adv_getitem(self):
                 a[self.indexes]
 
 
+class TestArrayBadDTypeIndexAdvGetitem:
+    @pytest.mark.parametrize("dtype", [object, "i,i", "float32", "str"])
+    def test_bad_dtype_adv_getitem(self, dtype):
+        # Test various bad dtypes, supported by CuPy or not.
+        a = cupy.arange(10)
+        with pytest.raises(IndexError, match="arrays used as indices"):
+            a[numpy.array([1, 2], dtype=dtype)]
+
+
 @testing.parameterize(
     {"shape": (0,), "indexes": ([False],)},
     {
@@ -950,6 +961,60 @@ class TestArrayAdvancedIndexingSetitemTranspose:
     def test_adv_setitem_transp(self, xp):
         shape = (2, 3, 4)
         a = xp.zeros(shape).transpose(0, 2, 1)
-        slices = (xp.array([1, 0]), slice(None), xp.array([2, 1]))
+        slices = (numpy.array([1, 0]), slice(None), numpy.array([2, 1]))
         a[slices] = 1
         return a
+
+
+class TestHugeArrays:
+    # These tests require a lot of memory
+    @testing.slow
+    def test_advanced(self):
+        try:
+            arr = cupy.ones((1, 2**30), dtype=cupy.int8)
+            idx = cupy.zeros(3, dtype=cupy.int32)
+            res = arr[idx, :]
+            # sanity check, we mostly care about it not crashing.
+            assert res.sum() == 3 * 2**30
+            del res
+
+            arr[idx, :] = cupy.array([[3], [3], [3]], dtype=cupy.int8)
+            # Check 3 got written (order may not be strictly guaranteed)
+            assert arr.sum() == 2**30 * 3
+        except MemoryError:
+            pytest.skip("out of memory in test.")
+
+    @testing.slow
+    def test_take_array(self):
+        try:
+            arr = cupy.ones((1, 2**32), dtype=cupy.int8)
+            arr[0, 2**30] = 0  # We should see each of these once
+            arr[0, -1] = 0
+            res = arr.take(cupy.array([0, 0]), axis=0)
+            # sanity check, we mostly care about it not crashing.
+            assert res.sum() == 2 * (2**32 - 2)
+        except MemoryError:
+            pytest.skip("out of memory in test.")
+
+    @testing.slow
+    def test_take_scalar(self):
+        try:
+            arr = cupy.ones((1, 2**32), dtype=cupy.int8)
+            arr[0, 2**30] = 0  # We should see each of these once
+            arr[0, -1] = 0
+            res = arr.take(0, axis=0)
+            # sanity check, we mostly care about it not crashing.
+            assert res.sum() == 2**32 - 2
+        except MemoryError:
+            pytest.skip("out of memory in test.")
+
+    @testing.slow
+    def test_choose(self):
+        try:
+            choices = cupy.zeros((2, 2**31), dtype=cupy.int8)
+            choices[1, :] = 1
+            res = choices[1, :].choose(choices)
+            # sanity check, we mostly care about it not crashing.
+            assert res.sum() == 2**31
+        except MemoryError:
+            pytest.skip("out of memory in test.")

From ec774a945fe3a88404c039605c8cd8edb45cf556 Mon Sep 17 00:00:00 2001
From: Anton Volkov <antonwolfy@gmail.com>
Date: Wed, 14 Jan 2026 04:07:58 -0800
Subject: [PATCH 3/7] Update third_party/cupy/core_tests to keep in sync with
 the remote repo. The tests were updated there to support running with
 pytest-run-parallel

---
 .../cupy/core_tests/test_carray.py            | 41 +++++++------
 .../third_party/cupy/core_tests/test_core.py  | 42 +++++++++----
 .../cupy/core_tests/test_cub_reduction.py     |  7 ++-
 .../cupy/core_tests/test_dlpack.py            | 23 ++++---
 .../cupy/core_tests/test_ndarray.py           | 20 ++-----
 .../cupy/core_tests/test_ndarray_reduction.py | 53 ++++++++--------
 .../third_party/cupy/core_tests/test_raw.py   | 60 ++++++++++++-------
 .../cupy/core_tests/test_userkernel.py        | 16 ++---
 8 files changed, 148 insertions(+), 114 deletions(-)

diff --git a/dpnp/tests/third_party/cupy/core_tests/test_carray.py b/dpnp/tests/third_party/cupy/core_tests/test_carray.py
index b161ef49e6b..7146460fc40 100644
--- a/dpnp/tests/third_party/cupy/core_tests/test_carray.py
+++ b/dpnp/tests/third_party/cupy/core_tests/test_carray.py
@@ -1,4 +1,4 @@
-import unittest
+from __future__ import annotations
 
 import pytest
 
@@ -8,7 +8,7 @@
 pytest.skip("CArray is not supported", allow_module_level=True)
 
 
-class TestCArray(unittest.TestCase):
+class TestCArray:
 
     def test_size(self):
         x = cupy.arange(3).astype("i")
@@ -63,39 +63,38 @@ def test_getitem_idx(self):
         testing.assert_array_equal(y, x)
 
 
-@testing.parameterize(
-    {"size": 2**31 - 1024},
-    {"size": 2**31},
-    {"size": 2**31 + 1024},
-    {"size": 2**32 - 1024},
-    {"size": 2**32},
-    {"size": 2**32 + 1024},
+@pytest.mark.parametrize(
+    "size",
+    [2**31 - 1024, 2**31, 2**31 + 1024, 2**32 - 1024, 2**32, 2**32 + 1024],
 )
-@testing.slow
-class TestCArray32BitBoundary(unittest.TestCase):
+@pytest.mark.slow
+@pytest.mark.thread_unsafe(reason="too large allocations")
+class TestCArray32BitBoundary:
     # This test case is intended to confirm CArray indexing work correctly
     # with input/output arrays whose size is so large that it crosses the
     # 32-bit boundary (in terms of both number of elements and size in bytes).
     # This test requires approx. 8 GiB GPU memory to run.
     # See https://github.com/cupy/cupy/pull/882 for detailed discussions.
-
-    def tearDown(self):
-        # Free huge memory for slow test
+    def teardown_method(self):
         cupy.get_default_memory_pool().free_all_blocks()
 
     # HIP is known to fail with sizes > 2**32-1024
-    @unittest.skipIf(cupy.cuda.runtime.is_hip, "HIP does not support this")
-    def test(self):
+    @pytest.mark.skipif(
+        cupy.cuda.runtime.is_hip, reason="HIP does not support this"
+    )
+    def test(self, size):
         # Elementwise
-        a = cupy.full((1, self.size), 7, dtype=cupy.int8)
+        a = cupy.full((1, size), 7, dtype=cupy.int8)
         # Reduction
         result = a.sum(axis=0, dtype=cupy.int8)
         # Explicitly specify the dtype to absorb Linux/Windows difference.
-        assert result.sum(dtype=cupy.int64) == self.size * 7
+        assert result.sum(dtype=cupy.int64) == size * 7
 
     # HIP is known to fail with sizes > 2**32-1024
-    @unittest.skipIf(cupy.cuda.runtime.is_hip, "HIP does not support this")
-    def test_assign(self):
-        a = cupy.zeros(self.size, dtype=cupy.int8)
+    @pytest.mark.skipif(
+        cupy.cuda.runtime.is_hip, reason="HIP does not support this"
+    )
+    def test_assign(self, size):
+        a = cupy.zeros(size, dtype=cupy.int8)
         a[-1] = 1.0
         assert a.sum() == 1
diff --git a/dpnp/tests/third_party/cupy/core_tests/test_core.py b/dpnp/tests/third_party/cupy/core_tests/test_core.py
index 9bcf5ae721c..c959b4f2495 100644
--- a/dpnp/tests/third_party/cupy/core_tests/test_core.py
+++ b/dpnp/tests/third_party/cupy/core_tests/test_core.py
@@ -8,6 +8,10 @@
 
 import dpnp as cupy
 from dpnp.tests.third_party.cupy import testing
+from dpnp.tests.third_party.cupy.testing._protocol_helpers import (
+    DummyObjectWithCudaArrayInterface,
+    DummyObjectWithCuPyGetNDArray,
+)
 
 
 class TestSize(unittest.TestCase):
@@ -37,6 +41,7 @@ def test_size_axis_error(self, dtype):
 
     @testing.numpy_cupy_equal()
     @testing.slow
+    # @pytest.mark.thread_unsafe(reason="Allocation too large.")
     def test_size_huge(self, xp):
         a = xp.ndarray(2**32, "b")  # 4 GiB
         return xp.size(a)
@@ -95,33 +100,44 @@ def test_cupy_ndarray(self, dtype):
         for v in (arr, (arr, arr)):
             assert cupy.min_scalar_type(v) is arr.dtype
 
-
-@testing.parameterize(
-    *testing.product(
-        {
-            "cxx": (None, "--std=c++14"),
-        }
+    @pytest.mark.parametrize(
+        "cupy_like",
+        [
+            DummyObjectWithCuPyGetNDArray,
+            DummyObjectWithCudaArrayInterface,
+        ],
     )
-)
-@pytest.mark.skip("compiling cupy headers are not supported")
-class TestCuPyHeaders(unittest.TestCase):
+    def test_cupy_likes_and_nested(self, cupy_like):
+        arr = cupy.array([[-1, 1]], dtype="int8")
 
-    def setUp(self):
+        obj = cupy_like(arr)
+        assert cupy.min_scalar_type(obj) is arr.dtype
+        if cupy_like is DummyObjectWithCuPyGetNDArray:
+            # __cupy_get_ndarray__ path currently assumes .shape and .dtype
+            obj.shape = arr.shape
+            obj.dtype = arr.dtype
+        assert cupy.min_scalar_type([obj, obj]) is arr.dtype
+
+
+@pytest.mark.skip("compiling cupy headers are not supported")
+class TestCuPyHeaders:
+    def setup_method(self):
         self.temporary_cache_dir_context = test_raw.use_temporary_cache_dir()
         self.cache_dir = self.temporary_cache_dir_context.__enter__()
         self.header = "\n".join(
             ["#include <" + h + ">" for h in core._cupy_header_list]
         )
 
-    def tearDown(self):
+    def teardown_method(self):
         self.temporary_cache_dir_context.__exit__(*sys.exc_info())
 
-    def test_compiling_core_header(self):
+    @pytest.mark.parametrize("cxx", (None, "--std=c++17"))
+    def test_compiling_core_header(self, cxx):
         code = r"""
         extern "C" __global__ void _test_ker_() { }
         """
         code = self.header + code
-        options = () if self.cxx is None else (self.cxx,)
+        options = () if cxx is None else (cxx,)
         ker = cupy.RawKernel(
             code, "_test_ker_", options=options, backend="nvrtc"
         )
diff --git a/dpnp/tests/third_party/cupy/core_tests/test_cub_reduction.py b/dpnp/tests/third_party/cupy/core_tests/test_cub_reduction.py
index 2983c9a152f..2adcbfe6ed3 100644
--- a/dpnp/tests/third_party/cupy/core_tests/test_cub_reduction.py
+++ b/dpnp/tests/third_party/cupy/core_tests/test_cub_reduction.py
@@ -1,3 +1,5 @@
+from __future__ import annotations
+
 import sys
 import unittest
 from itertools import combinations
@@ -19,7 +21,6 @@
 # This test class and its children below only test if CUB backend can be used
 # or not; they don't verify its correctness as it's already extensively covered
 # by existing tests
-@unittest.skipIf(_environment.get_cub_path() is None, "CUB not found")
 class CubReductionTestBase(unittest.TestCase):
     """
     Note: call self.can_use() when arrays are already allocated, otherwise
@@ -27,9 +28,11 @@ class CubReductionTestBase(unittest.TestCase):
     """
 
     def setUp(self):
+        if _environment.get_cub_path() is None:
+            pytest.skip("CUB not found")
         if cupy.cuda.runtime.is_hip:
             if _environment.get_hipcc_path() is None:
-                self.skipTest("hipcc is not found")
+                pytest.skip("hipcc is not found")
 
         self.can_use = cupy._core._cub_reduction._can_use_cub_block_reduction
 
diff --git a/dpnp/tests/third_party/cupy/core_tests/test_dlpack.py b/dpnp/tests/third_party/cupy/core_tests/test_dlpack.py
index 80715257f9b..2df8cbfc5f7 100644
--- a/dpnp/tests/third_party/cupy/core_tests/test_dlpack.py
+++ b/dpnp/tests/third_party/cupy/core_tests/test_dlpack.py
@@ -1,3 +1,5 @@
+from __future__ import annotations
+
 import dpctl
 import dpctl.tensor._dlpack as dlp
 import numpy
@@ -60,6 +62,8 @@ class TestNewDLPackConversion:
     def pool(self, request):
         self.memory = request.param
         if self.memory == "managed":
+            # if cuda.runtime.is_hip:
+            #     pytest.skip("HIP does not support managed memory")
             old_pool = cupy.get_default_memory_pool()
             new_pool = cuda.MemoryPool(cuda.malloc_managed)
             cuda.set_allocator(new_pool.malloc)
@@ -201,6 +205,8 @@ def test_conversion_device_to_cpu(self):
     @pytest.mark.skip("due to dpctl-2213")
     def test_stream(self):
         allowed_streams = ["null", True]
+        # if not cuda.runtime.is_hip:
+        #     allowed_streams.append("ptds")
 
         # stream order is automatically established via DLPack protocol
         for src_s in [self._get_stream(s) for s in allowed_streams]:
@@ -226,18 +232,18 @@ class TestDLTensorMemory:
 
     @pytest.fixture
     def pool(self):
-        pass
+        # old_pool = cupy.get_default_memory_pool()
+        # pool = cupy.cuda.MemoryPool()
+        # cupy.cuda.set_allocator(pool.malloc)
 
-    #     old_pool = cupy.get_default_memory_pool()
-    #     pool = cupy.cuda.MemoryPool()
-    #     cupy.cuda.set_allocator(pool.malloc)
+        # yield pool
 
-    # yield pool
-
-    #     pool.free_all_blocks()
-    #     cupy.cuda.set_allocator(old_pool.malloc)
+        # pool.free_all_blocks()
+        # cupy.cuda.set_allocator(old_pool.malloc)
+        pass
 
     @pytest.mark.parametrize("max_version", [None, (1, 0)])
+    # @pytest.mark.thread_unsafe(reason="modifies pool and tracks allocations")
     def test_deleter(self, pool, max_version):
         # memory is freed when tensor is deleted, as it's not consumed
         array = cupy.empty(10)
@@ -252,6 +258,7 @@ def test_deleter(self, pool, max_version):
         # assert pool.n_free_blocks() == 1
 
     @pytest.mark.parametrize("max_version", [None, (1, 0)])
+    # @pytest.mark.thread_unsafe(reason="modifies pool and tracks allocations")
     def test_deleter2(self, pool, max_version):
         # memory is freed when array2 is deleted, as tensor is consumed
         array = cupy.empty(10)
diff --git a/dpnp/tests/third_party/cupy/core_tests/test_ndarray.py b/dpnp/tests/third_party/cupy/core_tests/test_ndarray.py
index 200a29d2926..d782eb9f41e 100644
--- a/dpnp/tests/third_party/cupy/core_tests/test_ndarray.py
+++ b/dpnp/tests/third_party/cupy/core_tests/test_ndarray.py
@@ -246,6 +246,11 @@ def test_copy_multi_device_non_contiguous_K(self):
     # See cupy/cupy#5004
     @pytest.mark.skip("RawKernel() is not supported")
     @testing.multi_gpu(2)
+    # @pytest.mark.xfail(
+    #     runtime.is_hip,
+    #     reason='ROCm may work differently in async D2D copy with streams')
+    # @pytest.mark.thread_unsafe(
+    #     reason="order is unclear multithread. Also, hard crash in threaded!")
     def test_copy_multi_device_with_stream(self):
         # Kernel that takes long enough then finally writes values.
         src = _test_copy_multi_device_with_stream_src
@@ -430,21 +435,6 @@ def test_cuda_array_interface_stream(self):
                 assert iface["stream"] == stream.ptr
 
 
-@pytest.mark.skip("CUDA interface is not supported")
-class TestNdarrayCudaInterfaceNoneCUDA(unittest.TestCase):
-
-    def setUp(self):
-        self.arr = cupy.zeros(shape=(2, 3), dtype=cupy.float64)
-
-    def test_cuda_array_interface_hasattr(self):
-        assert not hasattr(self.arr, "__cuda_array_interface__")
-
-    def test_cuda_array_interface_getattr(self):
-        with pytest.raises(AttributeError) as e:
-            getattr(self.arr, "__cuda_array_interface__")
-        assert "HIP" in str(e.value)
-
-
 @testing.parameterize(
     *testing.product(
         {
diff --git a/dpnp/tests/third_party/cupy/core_tests/test_ndarray_reduction.py b/dpnp/tests/third_party/cupy/core_tests/test_ndarray_reduction.py
index 80b3f92fefa..0c652759f2a 100644
--- a/dpnp/tests/third_party/cupy/core_tests/test_ndarray_reduction.py
+++ b/dpnp/tests/third_party/cupy/core_tests/test_ndarray_reduction.py
@@ -1,3 +1,5 @@
+from __future__ import annotations
+
 import numpy
 import pytest
 
@@ -16,32 +18,31 @@ class TestArrayReduction:
 
     @pytest.fixture(scope="class")
     def exclude_cutensor(self):
+        # cuTENSOR seems to have issues in handling inf/nan in reduction-based
+        # routines, so we use this fixture to skip testing it
+        # self.old_routine_accelerators = _acc.get_routine_accelerators()
+        # self.old_reduction_accelerators = _acc.get_reduction_accelerators()
+
+        # rot_acc = self.old_routine_accelerators.copy()
+        # try:
+        #     rot_acc.remove(_acc.ACCELERATOR_CUTENSOR)
+        # except ValueError:
+        #     pass
+        # _acc.set_routine_accelerators(rot_acc)
+
+        # red_acc = self.old_reduction_accelerators.copy()
+        # try:
+        #     red_acc.remove(_acc.ACCELERATOR_CUTENSOR)
+        # except ValueError:
+        #     pass
+        # _acc.set_reduction_accelerators(red_acc)
+
+        # yield
+
+        # _acc.set_routine_accelerators(self.old_routine_accelerators)
+        # _acc.set_reduction_accelerators(self.old_reduction_accelerators)
         pass
 
-    #     # cuTENSOR seems to have issues in handling inf/nan in reduction-based
-    #     # routines, so we use this fixture to skip testing it
-    #     self.old_routine_accelerators = _acc.get_routine_accelerators()
-    #     self.old_reduction_accelerators = _acc.get_reduction_accelerators()
-
-    #     rot_acc = self.old_routine_accelerators.copy()
-    #     try:
-    #         rot_acc.remove(_acc.ACCELERATOR_CUTENSOR)
-    #     except ValueError:
-    #         pass
-    #     _acc.set_routine_accelerators(rot_acc)
-
-    #     red_acc = self.old_reduction_accelerators.copy()
-    #     try:
-    #         red_acc.remove(_acc.ACCELERATOR_CUTENSOR)
-    #     except ValueError:
-    #         pass
-    #     _acc.set_reduction_accelerators(red_acc)
-
-    #     yield
-
-    #     _acc.set_routine_accelerators(self.old_routine_accelerators)
-    #     _acc.set_reduction_accelerators(self.old_reduction_accelerators)
-
     @testing.for_all_dtypes()
     @testing.numpy_cupy_allclose(contiguous_check=False)
     def test_max_all(self, xp, dtype):
@@ -376,6 +377,10 @@ def test_zero_size(self, xp):
     )
 )
 @pytest.mark.skip("CUB reduction is not supported")
+# @pytest.mark.skipif(
+#     not cupy.cuda.cub.available, reason="The CUB routine is not enabled"
+# )
+# @pytest.mark.thread_unsafe(reason="unsafe setUp and AssertFunctionIsCalled.")
 class TestCubReduction:
 
     @pytest.fixture(autouse=True)
diff --git a/dpnp/tests/third_party/cupy/core_tests/test_raw.py b/dpnp/tests/third_party/cupy/core_tests/test_raw.py
index 480f6de6ae4..59581674a85 100644
--- a/dpnp/tests/third_party/cupy/core_tests/test_raw.py
+++ b/dpnp/tests/third_party/cupy/core_tests/test_raw.py
@@ -1,3 +1,5 @@
+from __future__ import annotations
+
 import contextlib
 import io
 import os
@@ -352,6 +354,8 @@
 
 @contextlib.contextmanager
 def use_temporary_cache_dir():
+    # Note uses mock, so not thread-safe (except at class/method level)
+    # tempdir fixture could be used instead.
     target1 = "cupy.cuda.compiler.get_cache_dir"
     target2 = "cupy.cuda.compiler._empty_file_preprocess_cache"
     temp_cache = {}
@@ -386,9 +390,8 @@ def find_nvcc_ver():
     nvcc_ver_pattern = r"release (\d+\.\d+)"
     cmd = cupy.cuda.get_nvcc_path().split()
     cmd += ["--version"]
-    cache_ctx = use_temporary_cache_dir()
-    with cache_ctx as cache_path:
-        output = compiler._run_cc(cmd, cache_path, "nvcc")
+
+    output = compiler._run_cc(cmd, cupy.cuda.compiler.get_cache_dir(), "nvcc")
     match = re.search(nvcc_ver_pattern, output)
     assert match
 
@@ -404,19 +407,19 @@ class _TestRawBase:
     _nvrtc_ver = None
 
     def setUp(self):
-        if hasattr(self, "clean_up"):
+        if getattr(self, "clean_up", False):
             if cupy.cuda.runtime.is_hip:
                 # Clearing memo triggers recompiling kernels using name
                 # expressions in other tests, e.g. dot and matmul, which
                 # hits a nvrtc bug. See #5843, #5945 and #6725.
-                self.skipTest("Clearing memo hits a nvrtc bug in other tests")
+                pytest.skip("Clearing memo hits a nvrtc bug in other tests")
             _util.clear_memo()
         self.dev = cupy.cuda.runtime.getDevice()
         assert self.dev != 1
-        if not hasattr(self, "jitify"):
-            self.jitify = False
+
+        self.jitify = getattr(self, "jitify", False)
         if cupy.cuda.runtime.is_hip and self.jitify:
-            self.skipTest("Jitify does not support ROCm/HIP")
+            pytest.skip("Jitify does not support ROCm/HIP")
 
         self.temporary_cache_dir_context = use_temporary_cache_dir()
         self.in_memory_context = compile_in_memory(self.in_memory)
@@ -858,6 +861,7 @@ def test_cuDoubleComplex(self):
         ker((grid,), (block,), (a, b, out))
         assert (out == a + b).all()
 
+    @pytest.mark.thread_unsafe(reason="mutates global in RawModule")
     def test_const_memory(self):
         mod = cupy.RawModule(
             code=test_const_mem, backend=self.backend, jitify=self.jitify
@@ -1126,6 +1130,7 @@ def test_context_switch_RawModule6(self):
     @unittest.skipUnless(
         not cupy.cuda.runtime.is_hip, "only CUDA raises warning"
     )
+    @pytest.mark.thread_unsafe(reason="mutates global cache directory")
     def test_compile_kernel(self):
         kern = cupy.RawKernel(
             _test_compile_src,
@@ -1144,6 +1149,7 @@ def test_compile_kernel(self):
     @unittest.skipUnless(
         not cupy.cuda.runtime.is_hip, "only CUDA raises warning"
     )
+    @pytest.mark.thread_unsafe(reason="mutates global cache directory")
     def test_compile_module(self):
         module = cupy.RawModule(
             code=_test_compile_src,
@@ -1187,6 +1193,9 @@ class TestRaw(_TestRawBase, unittest.TestCase):
     {"backend": "nvrtc", "in_memory": True, "clean_up": True, "jitify": True},
 )
 @testing.slow
+@pytest.mark.thread_unsafe(
+    reason="Jitify seems to have problems, skip as largely unmaintained."
+)
 class TestRawWithJitify(_TestRawBase, unittest.TestCase):
     pass
 
@@ -1220,7 +1229,8 @@ class TestRawWithJitify(_TestRawBase, unittest.TestCase):
     )
 )
 @unittest.skipIf(
-    find_nvcc_ver() >= 12020, "fp16 header compatibility issue, see cupy#8412"
+    cupy.cuda.runtime.is_hip or find_nvcc_ver() >= 12020,
+    "fp16 header compatibility issue, see cupy#8412 (Skip on HIP)",
 )
 @unittest.skipUnless(
     9000 <= cupy.cuda.runtime.runtimeGetVersion(), "Requires CUDA 9.x or later"
@@ -1229,9 +1239,8 @@ class TestRawWithJitify(_TestRawBase, unittest.TestCase):
     60 <= int(cupy.cuda.device.get_compute_capability()),
     "Requires compute capability 6.0 or later",
 )
-@unittest.skipIf(cupy.cuda.runtime.is_hip, "Skip on HIP")
 class TestRawGridSync(unittest.TestCase):
-
+    @pytest.mark.thread_unsafe(reason="mutates global cache directory")
     def test_grid_sync_rawkernel(self):
         n = self.n
         with use_temporary_cache_dir():
@@ -1249,6 +1258,7 @@ def test_grid_sync_rawkernel(self):
             kern_grid_sync((grid,), (block,), (x1, x2, y, n**2))
             assert cupy.allclose(y, x1 + x2)
 
+    @pytest.mark.thread_unsafe(reason="mutates global cache directory")
     def test_grid_sync_rawmodule(self):
         n = self.n
         with use_temporary_cache_dir():
@@ -1309,7 +1319,6 @@ def test_grid_sync_rawmodule(self):
     cupy.cuda.runtime.is_hip, "HIP does not support enable_cooperative_groups"
 )
 class TestRawPicklable(unittest.TestCase):
-
     def setUp(self):
         self.temporary_dir_context = use_temporary_cache_dir()
         self.temp_dir = self.temporary_dir_context.__enter__()
@@ -1392,17 +1401,7 @@ def test_raw_picklable(self):
 # Recent CCCL has made Jitify cold-launch very slow, see the discussion
 # starting https://github.com/cupy/cupy/pull/8899#issuecomment-2613022424.
 # TODO(leofang): Further refactor the test suite?
-@testing.parameterize(
-    *testing.product(
-        {
-            "jitify": (False, True),
-        }
-    )
-)
-@unittest.skipIf(cupy.cuda.runtime.is_hip, "Jitify does not support ROCm/HIP")
-@testing.slow
-class TestRawJitify(unittest.TestCase):
-
+class _TestRawJitify:
     def setUp(self):
         self.temporary_dir_context = use_temporary_cache_dir()
         self.temp_dir = self.temporary_dir_context.__enter__()
@@ -1509,3 +1508,18 @@ def test_jitify5(self):
             with pytest.raises(cupy.cuda.compiler.CompileException) as ex:
                 self._helper(hdr, options=("-I" + self.temp_dir,))
             assert "cannot open source file" in str(ex.value)
+
+
+@unittest.skipIf(cupy.cuda.runtime.is_hip, "Jitify does not support ROCm/HIP")
+@testing.slow
+class TestRawJitifyNoJitify(_TestRawJitify, unittest.TestCase):
+    jitify = False
+
+
+@unittest.skipIf(cupy.cuda.runtime.is_hip, "Jitify does not support ROCm/HIP")
+@testing.slow
+@pytest.mark.thread_unsafe(
+    reason="Jitify seems to have problems, skip as largely unmaintained."
+)
+class TestRawJitifyJitify(_TestRawJitify, unittest.TestCase):
+    jitify = True
diff --git a/dpnp/tests/third_party/cupy/core_tests/test_userkernel.py b/dpnp/tests/third_party/cupy/core_tests/test_userkernel.py
index ab184c4939f..aa839f61e9b 100644
--- a/dpnp/tests/third_party/cupy/core_tests/test_userkernel.py
+++ b/dpnp/tests/third_party/cupy/core_tests/test_userkernel.py
@@ -1,3 +1,5 @@
+from __future__ import annotations
+
 import unittest
 
 import numpy
@@ -252,12 +254,12 @@ class TestUserkernelScalar(unittest.TestCase):
     @testing.numpy_cupy_array_equal()
     def test_scalar(self, xp, dtype):
         x = testing.shaped_arange((2, 3, 4), xp, dtype)
+        y = numpy.array(self.value).astype(dtype)[()]
         if xp is numpy:
-            y = numpy.array(self.value).astype(dtype)
             return x + y
         else:
             kernel = cupy.ElementwiseKernel("T x, T y", "T z", "z = x + y")
-            return kernel(x, self.value)
+            return kernel(x, y)
 
 
 class TestUserkernelManualBlockSize(unittest.TestCase):
@@ -297,7 +299,6 @@ def _prep_texture(self):
             if dim == 3
             else (height, width) if dim == 2 else (width,)
         )
-        self.shape = shape
 
         # prepare input, output, and texture memory
         # self.data holds the data stored in the texture memory
@@ -307,7 +308,6 @@ def _prep_texture(self):
         )
         arr = CUDAarray(ch, width, height, depth)
         arr.copy_from(tex_data)
-        self.data = tex_data
 
         # create resource and texture descriptors
         res = ResourceDescriptor(runtime.cudaResourceTypeArray, cuArr=arr)
@@ -322,7 +322,7 @@ def _prep_texture(self):
         )
 
         # create a texture object
-        return TextureObject(res, tex)
+        return TextureObject(res, tex), tex_data, shape
 
     def _prep_kernel1D(self):
         return cupy.ElementwiseKernel(
@@ -370,7 +370,7 @@ def test_texture_input(self):
         width, height, depth = self.dimensions
         dim = 3 if depth != 0 else 2 if height != 0 else 1
 
-        texobj = self._prep_texture()
+        texobj, data, shape = self._prep_texture()
         ker = getattr(self, f"_prep_kernel{dim}D")()
 
         # prepare input
@@ -383,10 +383,10 @@ def test_texture_input(self):
             size *= depth
             args.append(height)
         in_arr = cupy.arange(size, dtype=cupy.float32)
-        in_arr = in_arr.reshape(self.shape)
+        in_arr = in_arr.reshape(shape)
         args[0] = in_arr
 
         # compute and validate output
         out_arr = ker(*args)
-        expected = in_arr + self.data
+        expected = in_arr + data
         testing.assert_allclose(out_arr, expected)

From 6a547b94193a931ca1f28b8414c070b074b0ea7a Mon Sep 17 00:00:00 2001
From: Anton Volkov <antonwolfy@gmail.com>
Date: Wed, 14 Jan 2026 04:36:09 -0800
Subject: [PATCH 4/7] Enable the supported error class test in
 third_party/cupy/test_init.py

---
 dpnp/tests/third_party/cupy/test_init.py | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/dpnp/tests/third_party/cupy/test_init.py b/dpnp/tests/third_party/cupy/test_init.py
index dbda6010e12..0a841ba28b2 100644
--- a/dpnp/tests/third_party/cupy/test_init.py
+++ b/dpnp/tests/third_party/cupy/test_init.py
@@ -1,3 +1,5 @@
+from __future__ import annotations
+
 import operator
 import os
 import shutil
@@ -73,6 +75,7 @@ def test_available(self):
         assert available
 
 
+# @pytest.mark.thread_unsafe(reason="modifies environment variables")
 @pytest.mark.skip("dpnp.is_available() is not implemented")
 class TestNotAvailable(unittest.TestCase):
 
@@ -145,18 +148,17 @@ def test_bitwise_not_is_invert(self):
             assert xp.bitwise_not is xp.invert
 
 
-@pytest.mark.skip("dpnp.exceptions is not implemented")
 @testing.with_requires("numpy>=2.0")
 @pytest.mark.parametrize(
     "name",
     [
         "exceptions.AxisError",
-        "exceptions.ComplexWarning",
-        "exceptions.ModuleDeprecationWarning",
-        "exceptions.RankWarning",
-        "exceptions.TooHardError",
-        "exceptions.VisibleDeprecationWarning",
-        "linalg.LinAlgError",
+        # "exceptions.ComplexWarning",
+        # "exceptions.ModuleDeprecationWarning",
+        # "exceptions.RankWarning",
+        # "exceptions.TooHardError",
+        # "exceptions.VisibleDeprecationWarning",
+        # "linalg.LinAlgError",
     ],
 )
 def test_error_classes(name):

From d31c2deda4eb809b4412d03cd481a39aa6f468c8 Mon Sep 17 00:00:00 2001
From: Anton Volkov <antonwolfy@gmail.com>
Date: Wed, 14 Jan 2026 04:46:07 -0800
Subject: [PATCH 5/7] Update third_party/cupy/testing/_helper.py

---
 .../tests/third_party/cupy/testing/_helper.py | 101 +++++++++++-------
 1 file changed, 65 insertions(+), 36 deletions(-)

diff --git a/dpnp/tests/third_party/cupy/testing/_helper.py b/dpnp/tests/third_party/cupy/testing/_helper.py
index 5414c7af2ca..eaec6683ca5 100644
--- a/dpnp/tests/third_party/cupy/testing/_helper.py
+++ b/dpnp/tests/third_party/cupy/testing/_helper.py
@@ -1,21 +1,20 @@
+from __future__ import annotations
+
 import contextlib
 import importlib.metadata
 import inspect
 import unittest
 import warnings
+from collections.abc import Callable
 from importlib.metadata import PackageNotFoundError
-from typing import Callable
 from unittest import mock
 
 import numpy
 
 import dpnp as cupy
-from dpnp.tests.third_party.cupy.testing._pytest_impl import is_available
 
 # from cupy._core import internal
-# import cupyx
-# import cupyx.scipy.sparse
-
+from dpnp.tests.third_party.cupy.testing._pytest_impl import is_available
 
 if is_available():
     import pytest
@@ -25,7 +24,7 @@
     _skipif = unittest.skipIf
 
 
-def with_requires(*requirements):
+def with_requires(*requirements: str) -> Callable[[Callable], Callable]:
     """Run a test case only when given requirements are satisfied.
 
     .. admonition:: Example
@@ -49,7 +48,7 @@ def with_requires(*requirements):
     return _skipif(not installed(*requirements), reason=msg)
 
 
-def installed(*specifiers):
+def installed(*specifiers: str) -> bool:
     """Returns True if the current environment satisfies the specified
     package requirement.
 
@@ -72,13 +71,13 @@ def installed(*specifiers):
     return True
 
 
-def numpy_satisfies(version_range):
+def numpy_satisfies(version_range: str) -> bool:
     """Returns True if numpy version satisfies the specified criteria.
 
     Args:
         version_range: A version specifier (e.g., `>=1.13.0`).
     """
-    return installed("numpy{}".format(version_range))
+    return installed(f"numpy{version_range}")
 
 
 def shaped_arange(shape, xp=cupy, dtype=numpy.float32, order="C", device=None):
@@ -162,45 +161,72 @@ def shaped_random(
     from uniform distribution over :math:`[0, scale)`
     with specified dtype.
     """
-    numpy.random.seed(seed)
+    rng = numpy.random.RandomState(seed)
     dtype = numpy.dtype(dtype)
     if dtype == "?":
-        a = numpy.random.randint(2, size=shape)
+        a = rng.randint(2, size=shape)
     elif dtype.kind == "c":
-        a = numpy.random.rand(*shape) + 1j * numpy.random.rand(*shape)
+        a = rng.rand(*shape) + 1j * rng.rand(*shape)
         a *= scale
     else:
-        a = numpy.random.rand(*shape) * scale
+        a = rng.rand(*shape) * scale
     return xp.asarray(a, dtype=dtype, order=order)
 
 
-# def shaped_sparse_random(
-#         shape, sp=cupyx.scipy.sparse, dtype=numpy.float32,
-#         density=0.01, format='coo', seed=0):
-#     """Returns an array filled with random values.
+def shaped_sparse_random(
+    shape, sp=None, dtype=numpy.float32, density=0.01, format="coo", seed=0
+):
+    """Returns a sparse matrix filled with random values.
+
+    Args:
+        shape (tuple): Shape of returned sparse matrix.
+        sp (scipy.sparse or cupyx.scipy.sparse): Sparse matrix module to use.
+        dtype (dtype): Dtype of returned sparse matrix.
+        density (float): Density of returned sparse matrix.
+        format (str): Format of returned sparse matrix, ``'coo'`` by default.
+        seed (int): Random seed.
+
+    Returns:
+        The sparse matrix with given shape, array module, dtype and format.
+    """
+    import cupyx.scipy.sparse
+    import scipy.sparse
+
+    if sp is None:
+        sp = cupyx.scipy.sparse
+    n_rows, n_cols = shape
+    a = scipy.sparse.random(n_rows, n_cols, density, random_state=seed).astype(
+        dtype
+    )
+
+    try:
+        return sp.coo_matrix(a).asformat(format)
+    except AttributeError:
+        raise ValueError(f"Module {sp} does not have the expected sparse APIs")
 
-#     Args:
-#         shape (tuple): Shape of returned sparse matrix.
-#         sp (scipy.sparse or cupyx.scipy.sparse): Sparse matrix module to use.
-#         dtype (dtype): Dtype of returned sparse matrix.
-#         density (float): Density of returned sparse matrix.
-#         format (str): Format of returned sparse matrix.
-#         seed (int): Random seed.
 
-#     Returns:
-#         The sparse matrix with given shape, array module,
-#     """
-#     import scipy.sparse
-#     n_rows, n_cols = shape
-#     numpy.random.seed(seed)
-#     a = scipy.sparse.random(n_rows, n_cols, density).astype(dtype)
+def shaped_linspace(start, stop, shape, xp=cupy, dtype=numpy.float32):
+    """Returns an array with given shape, array module, and dtype.
 
-#     if sp is cupyx.scipy.sparse:
-#         a = cupyx.scipy.sparse.coo_matrix(a)
-#     elif sp is not scipy.sparse:
-#         raise ValueError('Unknown module: {}'.format(sp))
+    Args:
+        start (int): The starting value.
+        stop (int): The end value.
+        shape (tuple of int): Shape of returned ndarray.
+        xp (numpy or cupy): Array module to use.
+        dtype (dtype): Dtype of returned ndarray.
 
-#     return a.asformat(format)
+    Returns:
+        numpy.ndarray or cupy.ndarray: Linearly spaced values of given shape.
+    """
+    dtype = numpy.dtype(dtype)
+    size = numpy.prod(shape)
+    if dtype == "?":
+        start = max(start, 0)
+        stop = min(stop, 1)
+    elif dtype.kind == "u":
+        start = max(start, 0)
+    a = numpy.linspace(start, stop, size)
+    return xp.array(a.astype(dtype).reshape(shape))
 
 
 def generate_matrix(
@@ -276,6 +302,7 @@ def assert_warns(expected):
 
 
 class NumpyAliasTestBase(unittest.TestCase):
+
     @property
     def func(self):
         raise NotImplementedError()
@@ -290,6 +317,7 @@ def numpy_func(self):
 
 
 class NumpyAliasBasicTestBase(NumpyAliasTestBase):
+
     def test_argspec(self):
         f = inspect.signature
         assert f(self.cupy_func) == f(self.numpy_func)
@@ -304,6 +332,7 @@ def test_docstring(self):
 
 
 class NumpyAliasValuesTestBase(NumpyAliasTestBase):
+
     def test_values(self):
         assert self.cupy_func(*self.args) == self.numpy_func(*self.args)
 

From 3f7410f00867b515532989244ad3d4a50fd603fa Mon Sep 17 00:00:00 2001
From: Anton Volkov <antonwolfy@gmail.com>
Date: Wed, 14 Jan 2026 04:50:49 -0800
Subject: [PATCH 6/7] Extend third_party/cupy/statistics_tests/test_order.py
 with testing of new methods

---
 .../cupy/statistics_tests/test_order.py       | 20 +++++++++++++------
 1 file changed, 14 insertions(+), 6 deletions(-)

diff --git a/dpnp/tests/third_party/cupy/statistics_tests/test_order.py b/dpnp/tests/third_party/cupy/statistics_tests/test_order.py
index b990c1eeb2c..58eb7999acc 100644
--- a/dpnp/tests/third_party/cupy/statistics_tests/test_order.py
+++ b/dpnp/tests/third_party/cupy/statistics_tests/test_order.py
@@ -1,9 +1,14 @@
+from __future__ import annotations
+
 import warnings
 
 import numpy
 import pytest
 
 import dpnp as cupy
+
+# import cupy._core._accelerator as _acc
+# from cupy import cuda
 from dpnp.tests.third_party.cupy import testing
 
 _all_methods = (
@@ -11,11 +16,11 @@
     # 'averaged_inverted_cdf',      # TODO(takagi) Not implemented
     # 'closest_observation',        # TODO(takagi) Not implemented
     # 'interpolated_inverted_cdf',  # TODO(takagi) Not implemented
-    # 'hazen',                      # TODO(takagi) Not implemented
-    # 'weibull',                    # TODO(takagi) Not implemented
+    "hazen",
+    "weibull",
     "linear",
-    # 'median_unbiased',            # TODO(takagi) Not implemented
-    # 'normal_unbiased',            # TODO(takagi) Not implemented
+    "median_unbiased",
+    "normal_unbiased",
     "lower",
     "higher",
     "midpoint",
@@ -166,9 +171,8 @@ def test_percentile_out_of_range_q(self, dtype, method):
                 with pytest.raises(ValueError):
                     xp.percentile(a, q, axis=-1, method=method)
 
-    @testing.for_all_dtypes()
     @testing.for_all_dtypes(no_float16=True, no_bool=True, no_complex=True)
-    @testing.numpy_cupy_allclose()
+    @testing.numpy_cupy_allclose(rtol=1e-6)
     def test_quantile_defaults(self, xp, dtype, method):
         a = testing.shaped_random((2, 3, 8), xp, dtype)
         q = testing.shaped_random((3,), xp, scale=1)
@@ -391,12 +395,16 @@ def test_ptp_axis2(self, xp, dtype):
     @testing.for_float_dtypes()
     @testing.numpy_cupy_allclose()
     def test_ptp_nan(self, xp, dtype):
+        # if _acc.ACCELERATOR_CUTENSOR in _acc.get_routine_accelerators():
+        #     pytest.skip()
         a = xp.array([float("nan"), 1, -1], dtype)
         return xp.ptp(a)
 
     @testing.for_float_dtypes()
     @testing.numpy_cupy_allclose()
     def test_ptp_all_nan(self, xp, dtype):
+        # if _acc.ACCELERATOR_CUTENSOR in _acc.get_routine_accelerators():
+        #     pytest.skip()
         a = xp.array([float("nan"), float("nan")], dtype)
         return xp.ptp(a)
 

From 5e2e332965ec280c4ada993883f9466252baf3aa Mon Sep 17 00:00:00 2001
From: Anton Volkov <antonwolfy@gmail.com>
Date: Wed, 14 Jan 2026 05:30:12 -0800
Subject: [PATCH 7/7] Add PR to the changelog

---
 CHANGELOG.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index aa531908591..8418736671e 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -40,6 +40,7 @@ Also, that release drops support for Python 3.9, making Python 3.10 the minimum
 * Unified public API definitions in `dpnp.linalg` and `dpnp.scipy` submodules [#2663](https://github.com/IntelPython/dpnp/pull/2663)
 * Aligned the signature of `dpnp.reshape` function with Python array API by making `shape` a required argument [#2673](https://github.com/IntelPython/dpnp/pull/2673)
 * Unified `dpnp` public API exports by consolidating function exports in `__init__.py` and removing wildcard imports [#2665](https://github.com/IntelPython/dpnp/pull/2665) [#2666](https://github.com/IntelPython/dpnp/pull/2666)
+* Compile indexing extension with `-fno-sycl-id-queries-fit-in-int` to support huge arrays [#2721](https://github.com/IntelPython/dpnp/pull/2721)
 
 ### Deprecated