Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Kernel&Prim] Fix IndexPutCudaKernel for thread safe and add index_put_double_grad #69095


Copy link

@HydrogenSulfate HydrogenSulfate commented Oct 31, 2024

PR Category

Operator Mechanism

PR Types

New features



Related PR: deepmodeling/deepmd-kit#4157

  1. 修复IndexPutCudaKernel线程不安全的+=加法,改为phi::CudaAtomicAdd,避免indices中含有重复的坐标导致结果不正确
  2. index_put添加到api.yaml里作为基础算子,而后在index_put_double_grad中复用该前向算子


由于gradient_checker不支持indices这一Tuple[Tensor, ..]输入类型,因此单测仅作覆盖率测试,精度测试与pytorch对比,如下所示(accumulate=False时前向计算的结果为赋值操作,具有不确定性,因此该前向结果不进行对比)


x.shape, indices.shape, value.shape baseline(pytorch) "+=" phi::CudaAtomicAdd
[16] [10] [10] 0.21 0.09 0.10(+0.01)
[16, 16] [20, 2] [20] 0.28 0.10 0.10(+0)
[12, 13, 14] [88, 1] [88, 13, 14] 0.22 0.17 0.23(+0.05)
[12, 13, 14] [88, 2] [88, 14] 0.28 0.17 0.17(+0)
[12, 13, 14] [88, 3] [88] 0.34 0.11 0.11(+0)
[12, 13, 14] [2184, 3] [2184] 0.48 0.10 0.14(+0.04)
[102, 103, 104] [1092624, 3] [1092624] 1.74 0.26 0.26(+0)



def test_inedx_put_grad_speed():
    """Time the forward ``index_put`` call in Paddle and in PyTorch.

    For every ``(x_shape, indices_shape, value_shape)`` case, fresh random
    inputs are generated 100 times and a single forward call of each
    framework is timed with ``time.perf_counter``.  The first 20 samples
    are treated as warm-up and dropped; the mean of the remaining samples
    is printed in milliseconds, Paddle first and PyTorch second.

    ``indices_shape`` is ``[n_indices]`` or ``[n_indices, n_indexed_dims]``;
    one integer index array of length ``n_indices`` is drawn for each
    indexed leading dimension of ``x``.

    NOTE(review): no device synchronisation is performed around the timed
    calls, so on an asynchronous GPU stream the numbers may mostly reflect
    launch latency -- confirm that this is the intended measurement.
    """
    import time

    import numpy as np
    import paddle
    import torch
    from tqdm import trange

    place = 'gpu'
    accumulate = True
    # PyTorch names the GPU device 'cuda'; other place strings are shared.
    torch_device = 'cuda' if place == 'gpu' else place
    n_warmup = 20

    def paddle_forward(x_, i_, v_):
        return paddle.index_put(x_, i_, v_, accumulate=accumulate)

    def torch_forward(x_, i_, v_):
        return torch.index_put(x_, i_, v_, accumulate=accumulate)

    for x_shape, indices_shape, value_shape in [
        ([16], [10], [10]),
        ([16, 16], [20, 2], [20]),
        ([12, 13, 14], [88, 1], [88, 13, 14]),
        ([12, 13, 14], [88, 2], [88, 14]),
        ([12, 13, 14], [88, 3], [88]),
    ]:
        pd_list = []
        pt_list = []
        n_indices = indices_shape[0]
        index_dim_size = indices_shape[1] if len(indices_shape) > 1 else 1

        for _ in trange(100):
            x_np = np.random.randn(*x_shape)
            # One index array per indexed dimension, bounded by that
            # dimension's extent in x.
            indices_np = tuple(
                np.random.randint(0, x_shape[dim], [n_indices])
                for dim in range(max(index_dim_size, 1))
            )
            value_np = np.random.randn(*value_shape).astype("float32")

            # run paddle
            x_pd = paddle.to_tensor(x_np.copy(), "float32", stop_gradient=False, place=place)
            indices_pd = [
                paddle.to_tensor(indice.copy(), "int64", stop_gradient=True, place=place)
                for indice in indices_np
            ]
            value_pd = paddle.to_tensor(value_np.copy(), "float32", stop_gradient=False, place=place)

            t = time.perf_counter()
            paddle_forward(x_pd, indices_pd, value_pd)
            t = time.perf_counter() - t
            # Record the sample; without this the reported mean is taken
            # over an empty list.
            pd_list.append(t)

            # run torch
            x_pt = torch.as_tensor(x_np, dtype=torch.float32, device=torch_device).requires_grad_(True)
            indices_pt = [
                torch.as_tensor(indice, dtype=torch.int64, device=torch_device).requires_grad_(False)
                for indice in indices_np
            ]
            value_pt = torch.as_tensor(value_np, dtype=torch.float32, device=torch_device).requires_grad_(True)

            t = time.perf_counter()
            torch_forward(x_pt, indices_pt, value_pt)
            t = time.perf_counter() - t
            pt_list.append(t)

        # First line: Paddle mean; second line: PyTorch mean (warm-up dropped).
        print(x_shape, indices_shape, value_shape, f"{np.asarray(pd_list[n_warmup:]).mean() * 1000: .2f} ms")
        print(x_shape, indices_shape, value_shape, f"{np.asarray(pt_list[n_warmup:]).mean() * 1000: .2f} ms")


def test_index_put_fwd_bwd_double_bwd():
    """Compare Paddle ``index_put`` against PyTorch up to second order.

    For each place (``cpu``, ``gpu``), each ``accumulate`` setting and each
    shape case, the same random inputs are fed to both frameworks through
    ``tanh(index_put(x, indices, value))``.  The following are compared
    with ``np.testing.assert_allclose``:

    * the forward output (only when ``accumulate`` is True),
    * the first-order gradients w.r.t. ``x`` and ``value``,
    * the second-order gradients obtained by differentiating those
      first-order gradients w.r.t. the upstream gradient ``dout``.

    The index arrays are random and may contain repeated coordinates.
    With ``accumulate=False`` the forward value at a repeated coordinate
    is not uniquely defined, and the ``value`` gradients flow through the
    derivative of ``tanh`` at exactly those coordinates, so the ``value``
    gradient checks are likewise restricted to ``accumulate=True``.

    Raises:
        AssertionError: if any compared pair differs beyond tolerance.
    """
    import numpy as np
    import paddle
    import torch

    for place in ['cpu', 'gpu']:
        # PyTorch names the GPU device 'cuda'; 'cpu' is shared.
        torch_device = 'cuda' if place == 'gpu' else place
        for accumulate in [False, True]:
            for x_shape, indices_shape, value_shape in [
                ([16], [10], [10]),
                ([16, 16], [20, 2], [20]),
                ([12, 13, 14], [88, 1], [88, 13, 14]),
                ([12, 13, 14], [88, 2], [88, 14]),
                ([12, 13, 14], [88, 3], [88]),
            ]:
                n_indices = indices_shape[0]
                index_dim_size = indices_shape[1] if len(indices_shape) > 1 else 1

                x_np = np.random.randn(*x_shape)
                # One index array per indexed dimension of x.
                indices_np = tuple(
                    np.random.randint(0, x_shape[dim], [n_indices])
                    for dim in range(max(index_dim_size, 1))
                )
                value_np = np.random.randn(*value_shape).astype("float32")

                # run paddle
                x_pd = paddle.to_tensor(x_np.copy(), "float32", stop_gradient=False, place=place)
                indices_pd = [
                    paddle.to_tensor(indice.copy(), "int64", stop_gradient=True, place=place)
                    for indice in indices_np
                ]
                value_pd = paddle.to_tensor(value_np.copy(), "float32", stop_gradient=False, place=place)

                out_pd = paddle.index_put(x_pd, indices_pd, value_pd, accumulate=accumulate)
                out_pd = paddle.tanh(out_pd)
                dout_np = np.random.randn(*out_pd.shape)

                dout_pd = paddle.to_tensor(dout_np.copy(), "float32", stop_gradient=False, place=place)

                dx_pd = paddle.grad(out_pd, x_pd, dout_pd, create_graph=True)[0]
                ddx_np = np.random.randn(*dx_pd.shape)

                # Differentiate w.r.t. value (the original differentiated
                # w.r.t. x a second time, so the value path was untested).
                dvalue_pd = paddle.grad(out_pd, value_pd, dout_pd, create_graph=True)[0]
                ddvalue_np = np.random.randn(*dvalue_pd.shape)

                ddx_pd = paddle.to_tensor(ddx_np.copy(), "float32", stop_gradient=False, place=place)
                ddvalue_pd = paddle.to_tensor(ddvalue_np.copy(), "float32", stop_gradient=False, place=place)
                ddout1_pd = paddle.grad(dx_pd, dout_pd, ddx_pd, create_graph=True)[0]
                ddout2_pd = paddle.grad(dvalue_pd, dout_pd, ddvalue_pd, create_graph=True)[0]

                # run torch
                x_pt = torch.as_tensor(x_np, dtype=torch.float32, device=torch_device).requires_grad_(True)
                indices_pt = [
                    torch.as_tensor(indice, dtype=torch.int64, device=torch_device).requires_grad_(False)
                    for indice in indices_np
                ]
                value_pt = torch.as_tensor(value_np, dtype=torch.float32, device=torch_device).requires_grad_(True)

                out_pt = torch.index_put(x_pt, indices_pt, value_pt, accumulate=accumulate)
                out_pt = torch.tanh(out_pt)

                dout_pt = torch.as_tensor(dout_np, dtype=torch.float32, device=torch_device).requires_grad_(True)

                dx_pt = torch.autograd.grad(out_pt, x_pt, dout_pt, create_graph=True)[0]
                dvalue_pt = torch.autograd.grad(out_pt, value_pt, dout_pt, create_graph=True)[0]

                ddx_pt = torch.as_tensor(ddx_np.copy(), dtype=torch.float32, device=torch_device).requires_grad_(True)
                ddvalue_pt = torch.as_tensor(ddvalue_np.copy(), dtype=torch.float32, device=torch_device).requires_grad_(True)
                ddout1_pt = torch.autograd.grad(dx_pt, dout_pt, ddx_pt, create_graph=True)[0]
                ddout2_pt = torch.autograd.grad(dvalue_pt, dout_pt, ddvalue_pt, create_graph=True)[0]

                # compare result
                ## output
                if accumulate:
                    np.testing.assert_allclose(out_pd.numpy(), out_pt.detach().cpu().numpy(), rtol=1e-6, atol=1e-6)

                ## 1-order and 2-order grad through x
                np.testing.assert_allclose(dx_pd.numpy(), dx_pt.detach().cpu().numpy(), rtol=2e-6, atol=2e-6)
                np.testing.assert_allclose(ddout1_pd.numpy(), ddout1_pt.detach().cpu().numpy(), rtol=2e-6, atol=2e-6)

                ## 1-order and 2-order grad through value
                # Skipped for accumulate=False: repeated random indices make
                # the quantities these gradients depend on ambiguous.
                if accumulate:
                    np.testing.assert_allclose(dvalue_pd.numpy(), dvalue_pt.detach().cpu().numpy(), rtol=2e-6, atol=2e-6)
                    np.testing.assert_allclose(ddout2_pd.numpy(), ddout2_pt.detach().cpu().numpy(), rtol=2e-6, atol=2e-6)

def index_put_fwd_bwd_double_bwd_with_accumulate_false():
    """Compare Paddle and PyTorch ``index_put`` on a fixed, duplicate-free case.

    ``x`` has shape ``[3, 4]`` and is indexed at five distinct ``(row, col)``
    coordinates.  Because no coordinate repeats, the forward result is
    well defined for both ``accumulate=False`` and ``accumulate=True``, so
    the forward output, the first-order gradients w.r.t. ``x`` and
    ``value``, and the second-order gradients w.r.t. the upstream gradient
    ``dout`` are all compared for both settings on ``cpu`` and ``gpu``.

    Raises:
        AssertionError: if any compared pair differs beyond tolerance.
    """
    import numpy as np
    import paddle
    import torch

    x_shape = [3, 4]
    indices_np = [
        np.asarray([0, 1, 1, 2, 2]),
        np.asarray([1, 3, 2, 1, 0]),
    ]  # N = 5 index tuples, D = 2 indexed dimensions
    indices_shape = [indices_np[0].shape[0], len(indices_np)]  # N x D
    n_indices = indices_shape[0]
    value_shape = [n_indices]

    x_np = np.random.randn(*x_shape)
    value_np = np.random.randn(*value_shape).astype("float32")

    for place in ['cpu', 'gpu']:
        # PyTorch names the GPU device 'cuda'; 'cpu' is shared.
        torch_device = 'cuda' if place == 'gpu' else place
        for accumulate in [False, True]:
            # run paddle
            x_pd = paddle.to_tensor(x_np.copy(), "float32", stop_gradient=False, place=place)
            indices_pd = [
                paddle.to_tensor(indice.copy(), "int64", stop_gradient=True, place=place)
                for indice in indices_np
            ]
            value_pd = paddle.to_tensor(value_np.copy(), "float32", stop_gradient=False, place=place)

            out_pd = paddle.index_put(x_pd, indices_pd, value_pd, accumulate=accumulate)
            out_pd = paddle.tanh(out_pd)
            dout_np = np.random.randn(*out_pd.shape)

            dout_pd = paddle.to_tensor(dout_np.copy(), "float32", stop_gradient=False, place=place)

            dx_pd = paddle.grad(out_pd, x_pd, dout_pd, create_graph=True)[0]
            ddx_np = np.random.randn(*dx_pd.shape)

            # Differentiate w.r.t. value (the original differentiated
            # w.r.t. x a second time, so the value path was untested).
            dvalue_pd = paddle.grad(out_pd, value_pd, dout_pd, create_graph=True)[0]
            ddvalue_np = np.random.randn(*dvalue_pd.shape)

            ddx_pd = paddle.to_tensor(ddx_np.copy(), "float32", stop_gradient=False, place=place)
            ddvalue_pd = paddle.to_tensor(ddvalue_np.copy(), "float32", stop_gradient=False, place=place)
            ddout1_pd = paddle.grad(dx_pd, dout_pd, ddx_pd, create_graph=True)[0]
            ddout2_pd = paddle.grad(dvalue_pd, dout_pd, ddvalue_pd, create_graph=True)[0]

            # run torch
            x_pt = torch.as_tensor(x_np.copy(), dtype=torch.float32, device=torch_device).requires_grad_(True)
            indices_pt = [
                torch.as_tensor(indice.copy(), dtype=torch.int64, device=torch_device).requires_grad_(False)
                for indice in indices_np
            ]
            value_pt = torch.as_tensor(value_np.copy(), dtype=torch.float32, device=torch_device).requires_grad_(True)

            out_pt = torch.index_put(x_pt, indices_pt, value_pt, accumulate=accumulate)
            out_pt = torch.tanh(out_pt)

            dout_pt = torch.as_tensor(dout_np.copy(), dtype=torch.float32, device=torch_device).requires_grad_(True)

            dx_pt = torch.autograd.grad(out_pt, x_pt, dout_pt, create_graph=True)[0]
            dvalue_pt = torch.autograd.grad(out_pt, value_pt, dout_pt, create_graph=True)[0]

            ddx_pt = torch.as_tensor(ddx_np.copy(), dtype=torch.float32, device=torch_device).requires_grad_(True)
            ddvalue_pt = torch.as_tensor(ddvalue_np.copy(), dtype=torch.float32, device=torch_device).requires_grad_(True)
            ddout1_pt = torch.autograd.grad(dx_pt, dout_pt, ddx_pt, create_graph=True)[0]
            ddout2_pt = torch.autograd.grad(dvalue_pt, dout_pt, ddvalue_pt, create_graph=True)[0]

            # compare result
            ## output
            np.testing.assert_allclose(out_pd.numpy(), out_pt.detach().cpu().numpy(), rtol=1e-6, atol=1e-6)

            ## 1-order grad
            np.testing.assert_allclose(dx_pd.numpy(), dx_pt.detach().cpu().numpy(), rtol=1e-6, atol=1e-6)
            np.testing.assert_allclose(dvalue_pd.numpy(), dvalue_pt.detach().cpu().numpy(), rtol=1e-6, atol=1e-6)

            ## 2-order grad
            np.testing.assert_allclose(ddout1_pd.numpy(), ddout1_pt.detach().cpu().numpy(), rtol=2e-6, atol=2e-6)
            np.testing.assert_allclose(ddout2_pd.numpy(), ddout2_pt.detach().cpu().numpy(), rtol=2e-6, atol=2e-6)

if __name__ == "__main__":
    # NOTE(review): the statements that followed this guard are missing from
    # the captured page; running the script's three entry points is the
    # presumed intent -- confirm against the original script.
    test_index_put_fwd_bwd_double_bwd()
    index_put_fwd_bwd_double_bwd_with_accumulate_false()
    test_inedx_put_grad_speed()

Copy link

paddle-bot bot commented Oct 31, 2024

Your PR has been submitted. Thanks for your contribution!
Please wait for the result of CI firstly. See Paddle CI Manual for details.

@HydrogenSulfate HydrogenSulfate changed the title [Prim] Add index_put_double_grad [Kernel&Prim] Fix IndexPutCudaKernel for thread safe and add index_put_double_grad Nov 1, 2024
@HydrogenSulfate HydrogenSulfate changed the title [Kernel&Prim] Fix IndexPutCudaKernel for thread safe and add index_put_double_grad [Kernel&Prim] Fix IndexPutCudaKernel for thread safe and add index_put_double_grad Nov 1, 2024
@HydrogenSulfate HydrogenSulfate merged commit ba7343b into PaddlePaddle:develop Nov 4, 2024
28 checks passed
@HydrogenSulfate HydrogenSulfate deleted the add_index_put_double_grad branch November 4, 2024 02:51
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
None yet
None yet

Successfully merging this pull request may close these issues.

3 participants