Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Kernel&Prim] Fix IndexPutCudaKernel for thread safe and add index_put_double_grad #69095

Conversation

HydrogenSulfate
Copy link
Contributor

@HydrogenSulfate HydrogenSulfate commented Oct 31, 2024

PR Category

Operator Mechanism

PR Types

New features

Description

Pcard-75624

Related PR: deepmodeling/deepmd-kit#4157

  1. 修复IndexPutCudaKernel线程不安全的+=加法,改为phi::CudaAtomicAdd,避免indices中含有重复的坐标导致结果不正确
  2. index_put添加到api.yaml里作为基础算子,而后在index_put_double_grad中复用该前向算子

Note

由于gradient_checker不支持indices这一Tuple[Tensor, ..]输入类型,因此单测仅作覆盖率测试,精度测试与pytorch对比,如下所示(accumulate=False时前向计算的结果为赋值操作,具有不确定性,因此该前向结果不进行对比)
image

由于使用CudaAtomicAdd对性能可能有影响,测试结果如下(单位:毫秒,测试100次取后80次的平均值):

x.shape, indices.shape, value.shape baseline(pytorch) "+=" phi::CudaAtomicAdd
[16] [10] [10] 0.21 0.09 0.10(+0.01)
[16, 16] [20, 2] [20] 0.28 0.10 0.10(+0)
[12, 13, 14] [88, 1] [88, 13, 14] 0.22 0.17 0.23(+0.05)
[12, 13, 14] [88, 2] [88, 14] 0.28 0.17 0.17(+0)
[12, 13, 14] [88, 3] [88] 0.34 0.11 0.11(+0)
[12, 13, 14] [2184, 3] [2184] 0.48 0.10 0.14(+0.04)
[102, 103, 104] [1092624, 3] [1092624] 1.74 0.26 0.26(+0)

可以看到CudaAtomicAdd会略微增加计算耗时,但是不会导致算子比pytorch更慢

测试脚本如下

def test_inedx_put_grad_speed():
    """Benchmark paddle.index_put forward time against torch.index_put.

    For each (x, indices, value) shape combination, the forward op is run
    100 times on GPU with accumulate=True; the reported number is the mean
    of the last 80 runs (the first 20 iterations serve as warmup).

    NOTE(review): the function name contains a typo ("inedx"); it is kept
    unchanged so any external caller keeps working.
    """
    import paddle
    from paddle.framework import core
    import numpy as np
    import torch
    import time

    core.set_prim_eager_enabled(True)
    core._set_prim_all_enabled(True)
    place = 'gpu'
    accumulate = True
    # torch device string corresponding to the paddle place; computed once
    # instead of re-evaluating the conditional for every tensor conversion.
    device = 'cuda' if place == 'gpu' else place
    from tqdm import trange
    for x_shape, indices_shape, value_shape in [
        ([16], [10], [10]),
        ([16, 16], [20, 2], [20]),
        ([12, 13, 14], [88, 1], [88, 13, 14]),
        ([12, 13, 14], [88, 2], [88, 14]),
        ([12, 13, 14], [88, 3], [88]),
    ]:
        pd_list = []
        pt_list = []
        for _ in trange(100):
            n_indices = indices_shape[0]
            # number of indexed leading axes of x (always >= 1)
            index_dim_size = indices_shape[1] if len(indices_shape) > 1 else 1

            x_np = np.random.randn(*x_shape)
            # one int index array per indexed axis, values within that axis
            indices_np = tuple(
                np.random.randint(0, x_shape[i], [n_indices])
                for i in range(index_dim_size)
            )
            value_np = np.random.randn(*value_shape).astype("float32")

            # run paddle
            x_pd = paddle.to_tensor(x_np.copy(), "float32", stop_gradient=False, place=place)
            indices_pd = [
                paddle.to_tensor(indice.copy(), "int64", stop_gradient=True, place=place)
                for indice in indices_np
            ]
            value_pd = paddle.to_tensor(value_np.copy(), "float32", stop_gradient=False, place=place)

            def paddle_forward(x_, i_, v_):
                return paddle.index_put(x_, i_, v_, accumulate=accumulate)

            # synchronize before and after so the wall-clock delta covers
            # only the kernel, not prior queued work
            paddle.device.cuda.synchronize()
            t = time.perf_counter()
            paddle_forward(x_pd, indices_pd, value_pd)
            paddle.device.cuda.synchronize()
            t = time.perf_counter() - t

            pd_list.append(t)

            # run torch
            x_pt = torch.as_tensor(x_np, dtype=torch.float32, device=device).requires_grad_(True)
            indices_pt = [
                torch.as_tensor(indice, dtype=torch.int64, device=device).requires_grad_(False)
                for indice in indices_np
            ]
            value_pt = torch.as_tensor(value_np, dtype=torch.float32, device=device).requires_grad_(True)

            def torch_forward(x_, i_, v_):
                return torch.index_put(x_, i_, v_, accumulate=accumulate)

            torch.cuda.synchronize()
            t = time.perf_counter()
            torch_forward(x_pt, indices_pt, value_pt)
            torch.cuda.synchronize()
            t = time.perf_counter() - t

            pt_list.append(t)

        # drop the first 20 warmup iterations, report mean in milliseconds
        print(x_shape, indices_shape, value_shape, f"{np.asarray(pd_list[20:]).mean() * 1000: .2f} ms")
        print(x_shape, indices_shape, value_shape, f"{np.asarray(pt_list[20:]).mean() * 1000: .2f} ms")

精度测试脚本如下:

def test_index_put_fwd_bwd_double_bwd():
    """Compare paddle.index_put forward, 1st-order and 2nd-order gradients
    against torch.index_put on CPU and GPU, with accumulate in {False, True}.

    The forward output is compared only when accumulate=True: with
    accumulate=False and randomly drawn (possibly duplicated) indices the
    assignment result is non-deterministic.
    """
    import paddle
    from paddle.framework import core
    import numpy as np
    import torch

    core.set_prim_eager_enabled(True)
    core._set_prim_all_enabled(True)
    for place in ['cpu', 'gpu']:
        for accumulate in [False, True]:
            for x_shape, indices_shape, value_shape in [
                ([16], [10], [10]),
                ([16, 16], [20, 2], [20]),
                ([12, 13, 14], [88, 1], [88, 13, 14]),
                ([12, 13, 14], [88, 2], [88, 14]),
                ([12, 13, 14], [88, 3], [88]),
            ]:
                n_indices = indices_shape[0]
                # number of indexed leading axes of x (always >= 1)
                index_dim_size = indices_shape[1] if len(indices_shape) > 1 else 1

                x_np = np.random.randn(*x_shape)
                # one int index array per indexed axis
                indices_np = tuple(
                    np.random.randint(0, x_shape[i], [n_indices])
                    for i in range(index_dim_size)
                )
                value_np = np.random.randn(*value_shape).astype("float32")

                # run paddle
                x_pd = paddle.to_tensor(x_np.copy(), "float32", stop_gradient=False, place=place)
                indices_pd = [
                    paddle.to_tensor(indice.copy(), "int64", stop_gradient=True, place=place)
                    for indice in indices_np
                ]
                value_pd = paddle.to_tensor(value_np.copy(), "float32", stop_gradient=False, place=place)

                out_pd = paddle.index_put(x_pd, indices_pd, value_pd, accumulate=accumulate)
                # tanh makes the graph non-linear so double-backward is non-trivial
                out_pd = paddle.tanh(out_pd)
                dout_np = np.random.randn(*out_pd.shape)

                dout_pd = paddle.to_tensor(dout_np.copy(), "float32", stop_gradient=False, place=place)
                dout_pd.stop_gradient = False

                dx_pd = paddle.grad(out_pd, x_pd, dout_pd, create_graph=True)[0]
                ddx_np = np.random.randn(*dx_pd.shape)

                # BUGFIX: differentiate w.r.t. value_pd — the original code
                # re-computed grad(out_pd, x_pd, ...), so the value gradient
                # (and its double grad below) was never actually tested.
                dvalue_pd = paddle.grad(out_pd, value_pd, dout_pd, create_graph=True)[0]
                ddvalue_np = np.random.randn(*dvalue_pd.shape)

                ddx_pd = paddle.to_tensor(ddx_np.copy(), "float32", stop_gradient=False, place=place)
                ddvalue_pd = paddle.to_tensor(ddvalue_np.copy(), "float32", stop_gradient=False, place=place)
                ddout1_pd = paddle.grad(dx_pd, dout_pd, ddx_pd, create_graph=True)[0]
                ddout2_pd = paddle.grad(dvalue_pd, dout_pd, ddvalue_pd, create_graph=True)[0]

                # run torch (reference implementation)
                device = 'cuda' if place == 'gpu' else place
                x_pt = torch.as_tensor(x_np, dtype=torch.float32, device=device).requires_grad_(True)
                indices_pt = [
                    torch.as_tensor(indice, dtype=torch.int64, device=device).requires_grad_(False)
                    for indice in indices_np
                ]
                value_pt = torch.as_tensor(value_np, dtype=torch.float32, device=device).requires_grad_(True)

                out_pt = torch.index_put(x_pt, indices_pt, value_pt, accumulate=accumulate)
                out_pt = torch.tanh(out_pt)

                dout_pt = torch.as_tensor(dout_np, dtype=torch.float32, device=device).requires_grad_(True)
                dout_pt.stop_gradient = False

                dx_pt = torch.autograd.grad(out_pt, x_pt, dout_pt, create_graph=True)[0]

                # BUGFIX: grad w.r.t. value_pt (was x_pt), mirroring the
                # paddle side above so the comparison is meaningful.
                dvalue_pt = torch.autograd.grad(out_pt, value_pt, dout_pt, create_graph=True)[0]

                ddx_pt = torch.as_tensor(ddx_np.copy(), dtype=torch.float32, device=device).requires_grad_(True)
                ddvalue_pt = torch.as_tensor(ddvalue_np.copy(), dtype=torch.float32, device=device).requires_grad_(True)
                ddout1_pt = torch.autograd.grad(dx_pt, dout_pt, ddx_pt, create_graph=True)[0]
                ddout2_pt = torch.autograd.grad(dvalue_pt, dout_pt, ddvalue_pt, create_graph=True)[0]

                # compare result
                ## output (only deterministic when accumulate=True)
                if accumulate:
                    np.testing.assert_allclose(out_pd.numpy(), out_pt.detach().cpu().numpy(), 1e-6, 1e-6)

                ## 1-order grad
                np.testing.assert_allclose(dx_pd.numpy(), dx_pt.detach().cpu().numpy(), 2e-6, 2e-6)
                np.testing.assert_allclose(dvalue_pd.numpy(), dvalue_pt.detach().cpu().numpy(), 2e-6, 2e-6)

                ## 2-order grad
                np.testing.assert_allclose(ddout1_pd.numpy(), ddout1_pt.detach().cpu().numpy(), 2e-6, 2e-6)
                np.testing.assert_allclose(ddout2_pd.numpy(), ddout2_pt.detach().cpu().numpy(), 2e-6, 2e-6)


def index_put_fwd_bwd_double_bwd_with_accumulate_false():
    """Compare paddle.index_put grads to torch on a fixed index set.

    The hand-written index pairs (0,1),(1,3),(1,2),(2,1),(2,0) address
    distinct coordinates, so the accumulate=False forward result is
    deterministic and can be compared directly (unlike the random-index
    test above, which skips the forward comparison in that case).
    """
    import paddle
    from paddle.framework import core
    import numpy as np
    import torch

    core.set_prim_eager_enabled(True)
    core._set_prim_all_enabled(True)

    x_shape = [3, 4]
    indices_np = [
        np.asarray([0, 1, 1, 2, 2]),
        np.asarray([1, 3, 2, 1, 0]),
    ] # N = 2, D = 4
    indices_shape = [indices_np[0].shape[0], len(indices_np)] # N x D
    n_indices = indices_shape[0]
    value_shape = [n_indices]


    x_np = np.random.randn(*x_shape)
    value_np = np.random.randn(*value_shape).astype("float32")

    for place in ['cpu', 'gpu']:
        for accumulate in [False, True]:
            # run paddle
            x_pd = paddle.to_tensor(x_np.copy(), "float32", stop_gradient=False, place=place)
            indices_pd = [
                paddle.to_tensor(indice.copy(), "int64", stop_gradient=True, place=place)
                for indice in indices_np
            ]
            value_pd = paddle.to_tensor(value_np.copy(), "float32", stop_gradient=False, place=place)

            out_pd = paddle.index_put(x_pd, indices_pd, value_pd, accumulate=accumulate)
            # tanh makes the graph non-linear so double-backward is non-trivial
            out_pd = paddle.tanh(out_pd)
            dout_np = np.random.randn(*out_pd.shape)

            dout_pd = paddle.to_tensor(dout_np.copy(), "float32", stop_gradient=False, place=place)
            dout_pd.stop_gradient = False

            dx_pd = paddle.grad(out_pd, x_pd, dout_pd, create_graph=True)[0]
            ddx_np = np.random.randn(*dx_pd.shape)

            # BUGFIX: differentiate w.r.t. value_pd — the original code
            # re-computed grad(out_pd, x_pd, ...), so the value gradient
            # (and its double grad below) was never actually tested.
            dvalue_pd = paddle.grad(out_pd, value_pd, dout_pd, create_graph=True)[0]
            ddvalue_np = np.random.randn(*dvalue_pd.shape)

            ddx_pd = paddle.to_tensor(ddx_np.copy(), "float32", stop_gradient=False, place=place)
            ddvalue_pd = paddle.to_tensor(ddvalue_np.copy(), "float32", stop_gradient=False, place=place)
            ddout1_pd = paddle.grad(dx_pd, dout_pd, ddx_pd, create_graph=True)[0]
            ddout2_pd = paddle.grad(dvalue_pd, dout_pd, ddvalue_pd, create_graph=True)[0]

            # run torch (reference implementation)
            device = 'cuda' if place == 'gpu' else place
            x_pt = torch.as_tensor(x_np.copy(), dtype=torch.float32, device=device).requires_grad_(True)
            indices_pt = [
                torch.as_tensor(indice.copy(), dtype=torch.int64, device=device).requires_grad_(False)
                for indice in indices_np
            ]
            value_pt = torch.as_tensor(value_np.copy(), dtype=torch.float32, device=device).requires_grad_(True)

            out_pt = torch.index_put(x_pt, indices_pt, value_pt, accumulate=accumulate)
            out_pt = torch.tanh(out_pt)

            dout_pt = torch.as_tensor(dout_np.copy(), dtype=torch.float32, device=device).requires_grad_(True)
            dout_pt.stop_gradient = False

            dx_pt = torch.autograd.grad(out_pt, x_pt, dout_pt, create_graph=True)[0]

            # BUGFIX: grad w.r.t. value_pt (was x_pt), mirroring the paddle
            # side above so the comparison is meaningful.
            dvalue_pt = torch.autograd.grad(out_pt, value_pt, dout_pt, create_graph=True)[0]

            ddx_pt = torch.as_tensor(ddx_np.copy(), dtype=torch.float32, device=device).requires_grad_(True)
            ddvalue_pt = torch.as_tensor(ddvalue_np.copy(), dtype=torch.float32, device=device).requires_grad_(True)
            ddout1_pt = torch.autograd.grad(dx_pt, dout_pt, ddx_pt, create_graph=True)[0]
            ddout2_pt = torch.autograd.grad(dvalue_pt, dout_pt, ddvalue_pt, create_graph=True)[0]

            # compare result
            ## output (deterministic even for accumulate=False: indices are unique)
            np.testing.assert_allclose(out_pd.numpy(), out_pt.detach().cpu().numpy(), 1e-6, 1e-6)

            ## 1-order grad
            np.testing.assert_allclose(dx_pd.numpy(), dx_pt.detach().cpu().numpy(), 1e-6, 1e-6)
            np.testing.assert_allclose(dvalue_pd.numpy(), dvalue_pt.detach().cpu().numpy(), 1e-6, 1e-6)

            ## 2-order grad
            np.testing.assert_allclose(ddout1_pd.numpy(), ddout1_pt.detach().cpu().numpy(), 2e-6, 2e-6)
            np.testing.assert_allclose(ddout2_pd.numpy(), ddout2_pt.detach().cpu().numpy(), 2e-6, 2e-6)

if __name__ == "__main__":
    # Run only the accuracy tests; the benchmark function
    # test_inedx_put_grad_speed defined above is not invoked here.
    test_index_put_fwd_bwd_double_bwd()
    index_put_fwd_bwd_double_bwd_with_accumulate_false()

Copy link

paddle-bot bot commented Oct 31, 2024

你的PR提交成功,感谢你对开源项目的贡献!
请关注后续CI自动化测试结果,详情请参考Paddle-CI手册
Your PR has been submitted. Thanks for your contribution!
Please wait for the result of CI firstly. See Paddle CI Manual for details.

@HydrogenSulfate HydrogenSulfate changed the title [Prim] Add index_put_double_grad [Kernel&Prim] Fix IndexPutCudaKernel for thread safe and add index_put_double_grad Nov 1, 2024
@HydrogenSulfate HydrogenSulfate changed the title [Kernel&Prim] Fix IndexPutCudaKernel for thread safe and add index_put_double_grad [Kernel&Prim] Fix IndexPutCudaKernel for thread safe and add index_put_double_grad Nov 1, 2024
@HydrogenSulfate HydrogenSulfate merged commit ba7343b into PaddlePaddle:develop Nov 4, 2024
28 checks passed
@HydrogenSulfate HydrogenSulfate deleted the add_index_put_double_grad branch November 4, 2024 02:51
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

Successfully merging this pull request may close these issues.

3 participants