Skip to content

Commit

Permalink
Merge branch 'net-smc-smc-intra-os-shortcut-with-loopback-ism'
Browse files Browse the repository at this point in the history
Wen Gu says:

====================
net/smc: SMC intra-OS shortcut with loopback-ism

This patch set acts as the second part of the new version of [1] (The first
part can be referred from [2]), the updated things of this version are listed
at the end.

- Background

SMC-D is now used in IBM z with ISM function to optimize network interconnect
for intra-CPC communications. Inspired by this, we try to make SMC-D available
on the non-s390 architecture through a software-implemented Emulated-ISM device,
that is the loopback-ism device here, to accelerate inter-process or
inter-containers communication within the same OS instance.

- Design

This patch set includes 3 parts:

 - Patch #1: some prepare work for loopback-ism.
 - Patch #2-#7: implement loopback-ism device and adapt SMC-D for it.
   loopback-ism now serves only SMC and no userspace interfaces exposed.
 - Patch #8-#11: memory copy optimization for intra-OS scenario.

The loopback-ism device is designed as an ISMv2 device and not be limited to
a specific net namespace, ends of both inter-process connection (1/1' in diagram
below) or inter-container connection (2/2' in diagram below) can find the same
available loopback-ism and choose it during the CLC handshake.

 Container 1 (ns1)                              Container 2 (ns2)
 +-----------------------------------------+    +-------------------------+
 | +-------+      +-------+      +-------+ |    |        +-------+        |
 | | App A |      | App B |      | App C | |    |        | App D |<-+     |
 | +-------+      +---^---+      +-------+ |    |        +-------+  |(2') |
 |     |127.0.0.1 (1')|             |192.168.0.11       192.168.0.12|     |
 |  (1)|   +--------+ | +--------+  |(2)   |    | +--------+   +--------+ |
 |     `-->|   lo   |-` |  eth0  |<-`      |    | |   lo   |   |  eth0  | |
 +---------+--|---^-+---+-----|--+---------+    +-+--------+---+-^------+-+
              |   |           |                                  |
 Kernel       |   |           |                                  |
 +----+-------v---+-----------v----------------------------------+---+----+
 |    |                            TCP                               |    |
 |    |                                                              |    |
 |    +--------------------------------------------------------------+    |
 |                                                                        |
 |                           +--------------+                             |
 |                           | smc loopback |                             |
 +---------------------------+--------------+-----------------------------+

loopback-ism device creates DMBs (shared memory) for each connection peer.
Since data transfer occurs within the same kernel, the sndbuf of each peer
is only a descriptor and point to the same memory region as peer DMB, so that
the data copy from sndbuf to peer DMB can be avoided in loopback-ism case.

 Container 1 (ns1)                              Container 2 (ns2)
 +-----------------------------------------+    +-------------------------+
 | +-------+                               |    |        +-------+        |
 | | App C |-----+                         |    |        | App D |        |
 | +-------+     |                         |    |        +-^-----+        |
 |               |                         |    |          |              |
 |           (2) |                         |    |     (2') |              |
 |               |                         |    |          |              |
 +---------------|-------------------------+    +----------|--------------+
                 |                                         |
 Kernel          |                                         |
 +---------------|-----------------------------------------|--------------+
 | +--------+ +--v-----+                           +--------+ +--------+  |
 | |dmb_desc| |snd_desc|                           |dmb_desc| |snd_desc|  |
 | +-----|--+ +--|-----+                           +-----|--+ +--------+  |
 | +-----|--+    |                                 +-----|--+             |
 | | DMB C  |    +---------------------------------| DMB D  |             |
 | +--------+                                      +--------+             |
 |                                                                        |
 |                           +--------------+                             |
 |                           | smc loopback |                             |
 +---------------------------+--------------+-----------------------------+

- Benchmark Test

 * Test environments:
      - VM with Intel Xeon Platinum 8 core 2.50GHz, 16 GiB mem.
      - SMC sndbuf/DMB size 1MB.

 * Test object:
      - TCP: run on TCP loopback.
      - SMC lo: run on SMC loopback-ism.

1. ipc-benchmark (see [3])

 - ./<foo> -c 1000000 -s 100

                            TCP                  SMC-lo
Message
rate (msg/s)              84991                  151293(+78.01%)

2. sockperf

 - serv: <smc_run> sockperf sr --tcp
 - clnt: <smc_run> sockperf { tp | pp } --tcp --msg-size={ 64000 for tp | 14 for pp } -i 127.0.0.1 -t 30

                            TCP                  SMC-lo
Bandwidth(MBps)        5033.569                7987.732(+58.69%)
Latency(us)               5.986                   3.398(-43.23%)

3. nginx/wrk

 - serv: <smc_run> nginx
 - clnt: <smc_run> wrk -t 8 -c 1000 -d 30 http://127.0.0.1:80

                           TCP                   SMC-lo
Requests/s           187951.76                267107.90(+42.12%)

4. redis-benchmark

 - serv: <smc_run> redis-server
 - clnt: <smc_run> redis-benchmark -h 127.0.0.1 -q -t set,get -n 400000 -c 200 -d 1024

                           TCP                   SMC-lo
GET(Requests/s)       86132.64                118133.49(+37.15%)
SET(Requests/s)       87374.40                122887.86(+40.65%)

Change log:
v7->v6
- Patch #2: minor: remove unnecessary 'return' of inline smc_loopback_exit().
- Patch #10: minor: directly return 0 instead of 'rc' in smcd_cdc_msg_send().
- all: collect the Reviewed-by tags.

v6->RFC v5
Link: https://lore.kernel.org/netdev/[email protected]/
- Patch #2: make the use of CONFIG_SMC_LO cleaner.
- Patch #5: mark some smcd_ops that loopback-ism doesn't support as
  optional and check for the support when they are called.
- Patch #7: keep loopback-ism at the beginning of the SMC-D device list.
- Some expression changes in commit logs and comments.

RFC v5->RFC v4:
Link: https://lore.kernel.org/netdev/[email protected]/
- Patch #2: minor changes in description of config SMC_LO and comments.
- Patch #10: minor changes in comments and if(smc_ism_support_dmb_nocopy())
  check in smcd_cdc_msg_send().
- Patch #3: change smc_lo_generate_id() to smc_lo_generate_ids() and SMC_LO_CHID
  to SMC_LO_RESERVED_CHID.
- Patch #5: memcpy while holding the ldev->dmb_ht_lock.
- Some expression changes in commit logs.

RFC v4->v3:
Link: https://lore.kernel.org/netdev/[email protected]/
- The merge window of v6.9 is open, so post this series as an RFC.
- Patch #6: since some information fed back by smc_nl_handle_smcd_dev() dose
  not apply to Emulated-ISM (including loopback-ism here), loopback-ism is
  not exposed through smc netlink for the time being. we may refactor this
  part when smc netlink interface is updated.

v3->v2:
Link: https://lore.kernel.org/netdev/[email protected]/
- Patch #11: use tasklet_schedule(&conn->rx_tsklet) instead of smcd_cdc_rx_handler()
  to avoid possible recursive locking of conn->send_lock and use {read|write}_lock_bh()
  to acquire dmb_ht_lock.

v2->v1:
Link: https://lore.kernel.org/netdev/[email protected]/
- All the patches: changed the term virtual-ISM to Emulated-ISM as defined by SMCv2.1.
- Patch #3: optimized the description of SMC_LO config. Avoid exposing loopback-ism
  to sysfs and remove all the knobs until future definition clear.
- Patch #3: try to make lockdep happy by using read_lock_bh() in smc_lo_move_data().
- Patch #6: defaultly use physical contiguous DMB buffers.
- Patch #11: defaultly enable DMB no-copy for loopback-ism and free the DMB in
  unregister_dmb or detach_dmb when dmb_node->refcnt reaches 0, instead of using
  wait_event to keep waiting in unregister_dmb.

v1->RFC:
Link: https://lore.kernel.org/netdev/[email protected]/
- Patch #9: merge rx_bytes and tx_bytes as xfer_bytes statistics:
  /sys/devices/virtual/smc/loopback-ism/xfer_bytes
- Patch #10: add support_dmb_nocopy operation to check if SMC-D device supports
  merging sndbuf with peer DMB.
- Patch #13 & #14: introduce loopback-ism device control of DMB memory type and
  control of whether to merge sndbuf and DMB. They can be respectively set by:
  /sys/devices/virtual/smc/loopback-ism/dmb_type
  /sys/devices/virtual/smc/loopback-ism/dmb_copy
  The motivation for these two control is that a performance bottleneck was
  found when using vzalloced DMB and sndbuf is merged with DMB, and there are
  many CPUs and CONFIG_HARDENED_USERCOPY is set [4]. The bottleneck is caused
  by the lock contention in vmap_area_lock [5] which is involved in memcpy_from_msg()
  or memcpy_to_msg(). Currently, Uladzislau Rezki is working on mitigating the
  vmap lock contention [6]. It has significant effects, but using virtual memory
  still has additional overhead compared to using physical memory.
  So this new version provides controls of dmb_type and dmb_copy to suit
  different scenarios.
- Some minor changes and comments improvements.

RFC->old version([1]):
Link: https://lore.kernel.org/netdev/[email protected]/
- Patch #1: improve the loopback-ism dump, it shows as follows now:
  # smcd d
  FID  Type  PCI-ID        PCHID  InUse  #LGs  PNET-ID
  0000 0     loopback-ism  ffff   No        0
- Patch #3: introduce the smc_ism_set_v2_capable() helper and set
  smc_ism_v2_capable when ISMv2 or virtual ISM is registered,
  regardless of whether there is already a device in smcd device list.
- Patch #3: loopback-ism will be added into /sys/devices/virtual/smc/loopback-ism/.
- Patch #8: introduce the runtime switch /sys/devices/virtual/smc/loopback-ism/active
  to activate or deactivate the loopback-ism.
- Patch #9: introduce the statistics of loopback-ism by
  /sys/devices/virtual/smc/loopback-ism/{{tx|rx}_tytes|dmbs_cnt}.
- Some minor changes and comments improvements.

[1] https://lore.kernel.org/netdev/[email protected]/
[2] https://lore.kernel.org/netdev/[email protected]/
[3] https://github.com/goldsborough/ipc-bench
[4] https://lore.kernel.org/all/[email protected]/
[5] https://lore.kernel.org/all/[email protected]/
[6] https://lore.kernel.org/all/[email protected]/
====================

Link: https://lore.kernel.org/r/[email protected]
Signed-off-by: Paolo Abeni <[email protected]>
  • Loading branch information
Paolo Abeni committed Apr 30, 2024
2 parents e5c5f35 + c3a910f commit e458a9a
Show file tree
Hide file tree
Showing 12 changed files with 721 additions and 28 deletions.
2 changes: 1 addition & 1 deletion drivers/s390/net/ism_drv.c
Original file line number Diff line number Diff line change
Expand Up @@ -745,7 +745,7 @@ static int smcd_query_rgid(struct smcd_dev *smcd, struct smcd_gid *rgid,
}

static int smcd_register_dmb(struct smcd_dev *smcd, struct smcd_dmb *dmb,
struct ism_client *client)
void *client)
{
return ism_register_dmb(smcd->priv, (struct ism_dmb *)dmb, client);
}
Expand Down
21 changes: 13 additions & 8 deletions include/net/smc.h
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,6 @@ struct smcd_dmb {
#define ISM_ERROR 0xFFFF

struct smcd_dev;
struct ism_client;

struct smcd_gid {
u64 gid;
Expand All @@ -58,26 +57,32 @@ struct smcd_ops {
int (*query_remote_gid)(struct smcd_dev *dev, struct smcd_gid *rgid,
u32 vid_valid, u32 vid);
int (*register_dmb)(struct smcd_dev *dev, struct smcd_dmb *dmb,
struct ism_client *client);
void *client);
int (*unregister_dmb)(struct smcd_dev *dev, struct smcd_dmb *dmb);
int (*add_vlan_id)(struct smcd_dev *dev, u64 vlan_id);
int (*del_vlan_id)(struct smcd_dev *dev, u64 vlan_id);
int (*set_vlan_required)(struct smcd_dev *dev);
int (*reset_vlan_required)(struct smcd_dev *dev);
int (*signal_event)(struct smcd_dev *dev, struct smcd_gid *rgid,
u32 trigger_irq, u32 event_code, u64 info);
int (*move_data)(struct smcd_dev *dev, u64 dmb_tok, unsigned int idx,
bool sf, unsigned int offset, void *data,
unsigned int size);
int (*supports_v2)(void);
void (*get_local_gid)(struct smcd_dev *dev, struct smcd_gid *gid);
u16 (*get_chid)(struct smcd_dev *dev);
struct device* (*get_dev)(struct smcd_dev *dev);

/* optional operations */
int (*add_vlan_id)(struct smcd_dev *dev, u64 vlan_id);
int (*del_vlan_id)(struct smcd_dev *dev, u64 vlan_id);
int (*set_vlan_required)(struct smcd_dev *dev);
int (*reset_vlan_required)(struct smcd_dev *dev);
int (*signal_event)(struct smcd_dev *dev, struct smcd_gid *rgid,
u32 trigger_irq, u32 event_code, u64 info);
int (*support_dmb_nocopy)(struct smcd_dev *dev);
int (*attach_dmb)(struct smcd_dev *dev, struct smcd_dmb *dmb);
int (*detach_dmb)(struct smcd_dev *dev, u64 token);
};

struct smcd_dev {
const struct smcd_ops *ops;
void *priv;
void *client;
struct list_head list;
spinlock_t lock;
struct smc_connection **conn;
Expand Down
13 changes: 13 additions & 0 deletions net/smc/Kconfig
Original file line number Diff line number Diff line change
Expand Up @@ -20,3 +20,16 @@ config SMC_DIAG
smcss.

if unsure, say Y.

config SMC_LO
bool "SMC intra-OS shortcut with loopback-ism"
depends on SMC
default n
help
SMC_LO enables the creation of an Emulated-ISM device named
loopback-ism in SMC and makes use of it for transferring data
when communication occurs within the same OS. This helps in
convenient testing of SMC-D since loopback-ism is independent
of architecture or hardware.

if unsure, say N.
1 change: 1 addition & 0 deletions net/smc/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,4 @@ smc-y := af_smc.o smc_pnet.o smc_ib.o smc_clc.o smc_core.o smc_wr.o smc_llc.o
smc-y += smc_cdc.o smc_tx.o smc_rx.o smc_close.o smc_ism.o smc_netlink.o smc_stats.o
smc-y += smc_tracepoint.o
smc-$(CONFIG_SYSCTL) += smc_sysctl.o
smc-$(CONFIG_SMC_LO) += smc_loopback.o
28 changes: 27 additions & 1 deletion net/smc/af_smc.c
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@
#include "smc_stats.h"
#include "smc_tracepoint.h"
#include "smc_sysctl.h"
#include "smc_loopback.h"

static DEFINE_MUTEX(smc_server_lgr_pending); /* serialize link group
* creation on server
Expand Down Expand Up @@ -1435,6 +1436,14 @@ static int smc_connect_ism(struct smc_sock *smc,
}

smc_conn_save_peer_info(smc, aclc);

if (smc_ism_support_dmb_nocopy(smc->conn.lgr->smcd)) {
rc = smcd_buf_attach(smc);
if (rc) {
rc = SMC_CLC_DECL_MEM; /* try to fallback */
goto connect_abort;
}
}
smc_close_init(smc);
smc_rx_init(smc);
smc_tx_init(smc);
Expand Down Expand Up @@ -2539,6 +2548,14 @@ static void smc_listen_work(struct work_struct *work)
mutex_unlock(&smc_server_lgr_pending);
}
smc_conn_save_peer_info(new_smc, cclc);

if (ini->is_smcd &&
smc_ism_support_dmb_nocopy(new_smc->conn.lgr->smcd)) {
rc = smcd_buf_attach(new_smc);
if (rc)
goto out_decl;
}

smc_listen_out_connected(new_smc);
SMC_STAT_SERV_SUCC_INC(sock_net(newclcsock->sk), ini);
goto out_free;
Expand Down Expand Up @@ -3555,15 +3572,23 @@ static int __init smc_init(void)
goto out_sock;
}

rc = smc_loopback_init();
if (rc) {
pr_err("%s: smc_loopback_init fails with %d\n", __func__, rc);
goto out_ib;
}

rc = tcp_register_ulp(&smc_ulp_ops);
if (rc) {
pr_err("%s: tcp_ulp_register fails with %d\n", __func__, rc);
goto out_ib;
goto out_lo;
}

static_branch_enable(&tcp_have_smc);
return 0;

out_lo:
smc_loopback_exit();
out_ib:
smc_ib_unregister_client();
out_sock:
Expand Down Expand Up @@ -3601,6 +3626,7 @@ static void __exit smc_exit(void)
tcp_unregister_ulp(&smc_ulp_ops);
sock_unregister(PF_SMC);
smc_core_exit();
smc_loopback_exit();
smc_ib_unregister_client();
smc_ism_exit();
destroy_workqueue(smc_close_wq);
Expand Down
36 changes: 34 additions & 2 deletions net/smc/smc_cdc.c
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
#include "smc_tx.h"
#include "smc_rx.h"
#include "smc_close.h"
#include "smc_ism.h"

/********************************** send *************************************/

Expand Down Expand Up @@ -255,6 +256,14 @@ int smcd_cdc_msg_send(struct smc_connection *conn)
return rc;
smc_curs_copy(&conn->rx_curs_confirmed, &curs, conn);
conn->local_rx_ctrl.prod_flags.cons_curs_upd_req = 0;

if (smc_ism_support_dmb_nocopy(conn->lgr->smcd))
/* if local sndbuf shares the same memory region with
* peer DMB, then don't update the tx_curs_fin
* and sndbuf_space until peer has consumed the data.
*/
return 0;

/* Calculate transmitted data and increment free send buffer space */
diff = smc_curs_diff(conn->sndbuf_desc->len, &conn->tx_curs_fin,
&conn->tx_curs_sent);
Expand All @@ -266,7 +275,7 @@ int smcd_cdc_msg_send(struct smc_connection *conn)
smc_curs_copy(&conn->tx_curs_fin, &conn->tx_curs_sent, conn);

smc_tx_sndbuf_nonfull(smc);
return rc;
return 0;
}

/********************************* receive ***********************************/
Expand Down Expand Up @@ -323,7 +332,7 @@ static void smc_cdc_msg_recv_action(struct smc_sock *smc,
{
union smc_host_cursor cons_old, prod_old;
struct smc_connection *conn = &smc->conn;
int diff_cons, diff_prod;
int diff_cons, diff_prod, diff_tx;

smc_curs_copy(&prod_old, &conn->local_rx_ctrl.prod, conn);
smc_curs_copy(&cons_old, &conn->local_rx_ctrl.cons, conn);
Expand All @@ -339,6 +348,29 @@ static void smc_cdc_msg_recv_action(struct smc_sock *smc,
atomic_add(diff_cons, &conn->peer_rmbe_space);
/* guarantee 0 <= peer_rmbe_space <= peer_rmbe_size */
smp_mb__after_atomic();

/* if local sndbuf shares the same memory region with
* peer RMB, then update tx_curs_fin and sndbuf_space
* here since peer has already consumed the data.
*/
if (conn->lgr->is_smcd &&
smc_ism_support_dmb_nocopy(conn->lgr->smcd)) {
/* Calculate consumed data and
* increment free send buffer space.
*/
diff_tx = smc_curs_diff(conn->sndbuf_desc->len,
&conn->tx_curs_fin,
&conn->local_rx_ctrl.cons);
/* increase local sndbuf space and fin_curs */
smp_mb__before_atomic();
atomic_add(diff_tx, &conn->sndbuf_space);
/* guarantee 0 <= sndbuf_space <= sndbuf_desc->len */
smp_mb__after_atomic();
smc_curs_copy(&conn->tx_curs_fin,
&conn->local_rx_ctrl.cons, conn);

smc_tx_sndbuf_nonfull(smc);
}
}

diff_prod = smc_curs_diff(conn->rmb_desc->len, &prod_old,
Expand Down
61 changes: 60 additions & 1 deletion net/smc/smc_core.c
Original file line number Diff line number Diff line change
Expand Up @@ -1149,6 +1149,20 @@ static void smcr_buf_unuse(struct smc_buf_desc *buf_desc, bool is_rmb,
}
}

static void smcd_buf_detach(struct smc_connection *conn)
{
struct smcd_dev *smcd = conn->lgr->smcd;
u64 peer_token = conn->peer_token;

if (!conn->sndbuf_desc)
return;

smc_ism_detach_dmb(smcd, peer_token);

kfree(conn->sndbuf_desc);
conn->sndbuf_desc = NULL;
}

static void smc_buf_unuse(struct smc_connection *conn,
struct smc_link_group *lgr)
{
Expand Down Expand Up @@ -1192,6 +1206,8 @@ void smc_conn_free(struct smc_connection *conn)
if (lgr->is_smcd) {
if (!list_empty(&lgr->list))
smc_ism_unset_conn(conn);
if (smc_ism_support_dmb_nocopy(lgr->smcd))
smcd_buf_detach(conn);
tasklet_kill(&conn->rx_tsklet);
} else {
smc_cdc_wait_pend_tx_wr(conn);
Expand Down Expand Up @@ -1445,6 +1461,8 @@ static void smc_conn_kill(struct smc_connection *conn, bool soft)
smc_sk_wake_ups(smc);
if (conn->lgr->is_smcd) {
smc_ism_unset_conn(conn);
if (smc_ism_support_dmb_nocopy(conn->lgr->smcd))
smcd_buf_detach(conn);
if (soft)
tasklet_kill(&conn->rx_tsklet);
else
Expand Down Expand Up @@ -2464,12 +2482,18 @@ int smc_buf_create(struct smc_sock *smc, bool is_smcd)
int rc;

/* create send buffer */
if (is_smcd &&
smc_ism_support_dmb_nocopy(smc->conn.lgr->smcd))
goto create_rmb;

rc = __smc_buf_create(smc, is_smcd, false);
if (rc)
return rc;

create_rmb:
/* create rmb */
rc = __smc_buf_create(smc, is_smcd, true);
if (rc) {
if (rc && smc->conn.sndbuf_desc) {
down_write(&smc->conn.lgr->sndbufs_lock);
list_del(&smc->conn.sndbuf_desc->list);
up_write(&smc->conn.lgr->sndbufs_lock);
Expand All @@ -2479,6 +2503,41 @@ int smc_buf_create(struct smc_sock *smc, bool is_smcd)
return rc;
}

int smcd_buf_attach(struct smc_sock *smc)
{
struct smc_connection *conn = &smc->conn;
struct smcd_dev *smcd = conn->lgr->smcd;
u64 peer_token = conn->peer_token;
struct smc_buf_desc *buf_desc;
int rc;

buf_desc = kzalloc(sizeof(*buf_desc), GFP_KERNEL);
if (!buf_desc)
return -ENOMEM;

/* The ghost sndbuf_desc describes the same memory region as
* peer RMB. Its lifecycle is consistent with the connection's
* and it will be freed with the connections instead of the
* link group.
*/
rc = smc_ism_attach_dmb(smcd, peer_token, buf_desc);
if (rc)
goto free;

smc->sk.sk_sndbuf = buf_desc->len;
buf_desc->cpu_addr =
(u8 *)buf_desc->cpu_addr + sizeof(struct smcd_cdc_msg);
buf_desc->len -= sizeof(struct smcd_cdc_msg);
conn->sndbuf_desc = buf_desc;
conn->sndbuf_desc->used = 1;
atomic_set(&conn->sndbuf_space, conn->sndbuf_desc->len);
return 0;

free:
kfree(buf_desc);
return rc;
}

static inline int smc_rmb_reserve_rtoken_idx(struct smc_link_group *lgr)
{
int i;
Expand Down
1 change: 1 addition & 0 deletions net/smc/smc_core.h
Original file line number Diff line number Diff line change
Expand Up @@ -557,6 +557,7 @@ void smc_smcd_terminate(struct smcd_dev *dev, struct smcd_gid *peer_gid,
void smc_smcd_terminate_all(struct smcd_dev *dev);
void smc_smcr_terminate_all(struct smc_ib_device *smcibdev);
int smc_buf_create(struct smc_sock *smc, bool is_smcd);
int smcd_buf_attach(struct smc_sock *smc);
int smc_uncompress_bufsize(u8 compressed);
int smc_rmb_rtoken_handling(struct smc_connection *conn, struct smc_link *link,
struct smc_clc_msg_accept_confirm *clc);
Expand Down
Loading

0 comments on commit e458a9a

Please sign in to comment.