From f2409b61a1e393c53fb6e5528c069c3641aeadab Mon Sep 17 00:00:00 2001 From: Petr Vyazovik Date: Thu, 13 Apr 2023 17:28:07 +0400 Subject: [PATCH 1/3] fw/ss_skb: Perform SKB linear data split when doing http/1 -> http/2 response transformation Signed-off-by: Petr Vyazovik --- fw/hpack.c | 16 +++++++--------- fw/http.c | 14 ++++++++++++-- fw/http_msg.c | 24 ++++++++++++++---------- fw/http_msg.h | 3 +-- fw/ss_skb.c | 27 +++++++++++++++++++++++++++ fw/ss_skb.h | 39 +++++++++++++++++++++++++++++++++++++++ lib/log.h | 15 ++++++++++++--- 7 files changed, 112 insertions(+), 26 deletions(-) diff --git a/fw/hpack.c b/fw/hpack.c index 4b72dd3bb..52fbbcf6b 100644 --- a/fw/hpack.c +++ b/fw/hpack.c @@ -2556,13 +2556,9 @@ tfw_hpack_write(const TfwStr *h_field, char *out_buf) return out_buf; } -#ifdef DEBUG +#if defined(DEBUG) && (DEBUG >= 4) #define T_DBG_PRINT_HPACK_RBTREE(tbl) tfw_hpack_rbtree_print(tbl) -#else -#define T_DBG_PRINT_HPACK_RBTREE(tbl) -#endif -#ifdef DEBUG /* Debug functions for printing red-black tree. */ static void tfw_hpack_rbtree_print_level(TfwHPackETbl *__restrict tbl, @@ -2576,7 +2572,7 @@ tfw_hpack_rbtree_print_level(TfwHPackETbl *__restrict tbl, tfw_hpack_rbtree_print_level(tbl, left, level + 1, pr_level); tfw_hpack_rbtree_print_level(tbl, right, level + 1, pr_level); } else if (root) { - T_DBG3("level (%u): rindex %lu hdr_len %hu color %hu " + T_DBGV("level (%u): rindex %lu hdr_len %hu color %hu " "parent %hd left %hd right %hd | hdr %.*s", level, root->rindex, root->hdr_len, root->color, root->parent, root->left, root->right, @@ -2604,15 +2600,17 @@ tfw_hpack_rbtree_print(TfwHPackETbl *__restrict tbl) { unsigned int pr_level, max_level; - T_DBG3("hpack rbtree:\n"); - T_DBG3("first %p last %p rb_len %hu rb_size %hu size %hu\n", + T_DBGV("hpack rbtree:\n"); + T_DBGV("first %p last %p rb_len %hu rb_size %hu size %hu\n", tbl->first, tbl->last, tbl->rb_len, tbl->rb_size, tbl->size); - T_DBG3("window %hu rbuf %p root %p idx_acc %lu\n", + T_DBGV("window %hu rbuf %p root %p idx_acc %lu\n", tbl->window, tbl->rbuf, tbl->root, tbl->idx_acc); max_level = tfw_hpack_rbtree_cacl_level(tbl, tbl->root); for (pr_level = 0; pr_level < max_level; ++pr_level) tfw_hpack_rbtree_print_level(tbl, tbl->root, 0, pr_level); } +#else +#define T_DBG_PRINT_HPACK_RBTREE(tbl) #endif /* diff --git a/fw/http.c b/fw/http.c index 6acbfa99d..c95a0b9c1 100644 --- a/fw/http.c +++ b/fw/http.c @@ -4991,6 +4991,11 @@ tfw_h1_resp_adjust_fwd(TfwHttpResp *resp) TFW_INC_STAT_BH(serv.msgs_otherr); return; } + + T_DBGV("[%d] %s: req %pK resp %pK: \n", smp_processor_id(), __func__, + req, resp); + SS_SKB_QUEUE_DUMP(&resp->msg.skb_head); + tfw_http_resp_fwd(resp); } @@ -5322,7 +5327,8 @@ tfw_h2_resp_adjust_fwd(TfwHttpResp *resp) WARN_ON_ONCE(mit->acc_len); tfw_h2_msg_transform_setup(mit, resp->msg.skb_head, true); - tfw_h2_msg_cutoff_headers(resp, &cleanup); + if (tfw_h2_msg_cutoff_headers(resp, &cleanup)) + goto clean; /* * Alloc room for frame header. After this call resp->pool @@ -5385,6 +5391,10 @@ tfw_h2_resp_adjust_fwd(TfwHttpResp *resp) if (unlikely(r)) goto clean; + T_DBGV("[%d] %s: req %pK resp %pK: \n", smp_processor_id(), __func__, + req, resp); + SS_SKB_QUEUE_DUMP(&resp->msg.skb_head); + tfw_h2_resp_fwd(resp); __tfw_h2_resp_cleanup(&cleanup); @@ -6698,7 +6708,7 @@ tfw_http_msg_process_generic(TfwConn *conn, TfwStream *stream, stream->msg, conn); } - T_DBG2("Add skb %p to message %p\n", skb, stream->msg); + T_DBG2("Add skb %pK to message %pK\n", skb, stream->msg); ss_skb_queue_tail(&stream->msg->skb_head, skb); if (TFW_CONN_TYPE(conn) & Conn_Clnt) diff --git a/fw/http_msg.c b/fw/http_msg.c index 9a9f0ddd0..c135cb2f8 100644 --- a/fw/http_msg.c +++ b/fw/http_msg.c @@ -1551,9 +1551,10 @@ __tfw_h2_msg_shrink_frag(struct sk_buff *skb, int frag_idx, const char *nbegin) * At this point we can't free SKBs, because data that they contain used * as source for message trasformation. */ -void +int tfw_h2_msg_cutoff_headers(TfwHttpResp *resp, TfwHttpRespCleanup* cleanup) { + int i, ret; char *begin, *end; TfwMsgIter *it = &resp->mit.iter; char* body = TFW_STR_CHUNK(&resp->body, 0)->data; @@ -1561,7 +1562,6 @@ tfw_h2_msg_cutoff_headers(TfwHttpResp *resp, TfwHttpRespCleanup* cleanup) char *off = body ? body : crlf->data + crlf->len; do { - unsigned char i; struct sk_buff *skb; struct skb_shared_info *si = skb_shinfo(it->skb); @@ -1571,11 +1571,18 @@ tfw_h2_msg_cutoff_headers(TfwHttpResp *resp, TfwHttpRespCleanup* cleanup) if ((begin <= off) && (end >= off)) { it->frag = -1; - /* TODO: Handle this case, most likely for low MTU - * it will be implemented as part of #1703 */ - return; + /* We would end up here if the start of the body or + * the end of CRLF lies within the linear data area + * of the current @it->skb + */ + ret = ss_skb_linear_transform(it->skb_head, + it->skb, body); + if (unlikely(ret)) + return ret; + break; } } + for (i = 0; i < si->nr_frags; i++) { skb_frag_t *f = &si->frags[i]; @@ -1620,22 +1627,19 @@ tfw_h2_msg_cutoff_headers(TfwHttpResp *resp, TfwHttpRespCleanup* cleanup) it->skb = it->skb->next; ss_skb_unlink(&it->skb_head, skb); ss_skb_queue_tail(&cleanup->skb_head, skb); - } while (it->skb != NULL); end: /* Pointer to data or CRLF not found in skbs. */ BUG_ON(!it->skb_head || !it->skb); - /* Trim skb linear data. This is ugly hotfix and must be removed after - * implementation of #1703 */ - it->skb->len -= skb_headlen(it->skb); - it->skb_head = it->skb; resp->msg.skb_head = it->skb; /* Start from zero fragment */ it->frag = -1; + + return 0; } /** diff --git a/fw/http_msg.h b/fw/http_msg.h index f71581e75..0b908b070 100644 --- a/fw/http_msg.h +++ b/fw/http_msg.h @@ -111,7 +111,6 @@ tfw_h2_msg_transform_setup(TfwHttpTransIter *mit, struct sk_buff *skb, { TfwMsgIter *iter = &mit->iter; - BUILD_BUG_ON(HTTP2_MAX_OFFSET <= FRAME_HEADER_SIZE); BUG_ON(!skb); BUG_ON(mit->frame_head); @@ -181,7 +180,7 @@ int __hdr_name_cmp(const TfwStr *hdr, const TfwStr *cmp_hdr); int __http_hdr_lookup(TfwHttpMsg *hm, const TfwStr *hdr); int tfw_h2_msg_write_data_pool(TfwHttpTransIter *mit, TfwPool* pool, const TfwStr *str, bool add_frag, bool has_body); -void tfw_h2_msg_cutoff_headers(TfwHttpResp *resp, TfwHttpRespCleanup* cleanup); +int tfw_h2_msg_cutoff_headers(TfwHttpResp *resp, TfwHttpRespCleanup* cleanup); int tfw_http_msg_insert(TfwMsgIter *it, char **off, const TfwStr *data); #define TFW_H2_MSG_HDR_ADD(hm, name, val, idx) \ diff --git a/fw/ss_skb.c b/fw/ss_skb.c index 8c43df3dc..0658f4ba8 100644 --- a/fw/ss_skb.c +++ b/fw/ss_skb.c @@ -1571,3 +1571,30 @@ ss_skb_add_frag(struct sk_buff *skb_head, struct sk_buff *skb, char* addr, return 0; } + +/* Using @split_point transform the remaining linear portion of original @skb + * to the first fragment of the same SKB. Existing fragments of @skb + * would moved to next SKB if necessary inside __split_linear_data(). + */ +int +ss_skb_linear_transform(struct sk_buff *skb_head, struct sk_buff *skb, + unsigned char *split_point) +{ + int fpos, r; + TfwStr _; + + if (!split_point) { + /* Usage of linear portion of SKB is not expected */ + ss_skb_put(skb, -skb_headlen(skb)); + skb->tail_lock = 1; + } else { + unsigned int off = split_point - skb->data; + + r = __split_linear_data(skb_head, skb, split_point, 0, &_, &fpos); + if (unlikely(r)) + return r; + ss_skb_put(skb, -off); + } + return 0; +} + diff --git a/fw/ss_skb.h b/fw/ss_skb.h index 1986e49de..082609300 100644 --- a/fw/ss_skb.h +++ b/fw/ss_skb.h @@ -256,6 +256,19 @@ ss_skb_alloc(size_t n) return skb; } +/* Move @cnt existing fragment descriptors within the SKB. + */ +static inline void +ss_skb_frags_move(struct sk_buff *skb, unsigned int new_pos, + unsigned int old_pos, unsigned int cnt) +{ + struct skb_shared_info *si = skb_shinfo(skb); + if (WARN_ON(!si->nr_frags)) + return; + BUG_ON(new_pos + cnt > MAX_SKB_FRAGS); + memmove(&si->frags[new_pos], &si->frags[old_pos], cnt * sizeof(skb_frag_t)); +} + #define SS_SKB_MAX_DATA_LEN (SKB_MAX_HEADER + MAX_SKB_FRAGS * PAGE_SIZE) char *ss_skb_fmt_src_addr(const struct sk_buff *skb, char *out_buf); @@ -288,4 +301,30 @@ int ss_skb_to_sgvec_with_new_pages(struct sk_buff *skb, struct scatterlist *sgl, struct page ***old_pages); int ss_skb_add_frag(struct sk_buff *skb_head, struct sk_buff *skb, char* addr, int frag_idx, size_t frag_sz); +int +ss_skb_linear_transform(struct sk_buff *skb_head, struct sk_buff *skb, + unsigned char *split_point); + +#if defined(DEBUG) && (DEBUG >= 4) +#define ss_skb_queue_for_each_do(queue, lambda) \ +do { \ + int i = 0; \ + struct sk_buff *skb = *queue; \ + if (likely(skb)) { \ + do { \ + lambda; \ + skb = skb->next; \ + } while (skb != *queue); \ + } \ +} while(0) + +#define SS_SKB_QUEUE_DUMP(queue) \ + ss_skb_queue_for_each_do(queue, { \ + pr_debug("#%2d skb => %pK\n", i++, skb); \ + skb_dump(KERN_DEBUG, skb, true); \ + }); +#else +#define SS_SKB_QUEUE_DUMP(...) +#endif + #endif /* __TFW_SS_SKB_H__ */ diff --git a/lib/log.h b/lib/log.h index ec1d07299..423d22036 100644 --- a/lib/log.h +++ b/lib/log.h @@ -45,7 +45,8 @@ enum { * -DDEBUG=2 - same as above, but also enables T_DBG2(). * -DDEBUG=3 - same as above plus T_DBG3(). * ...etc - * Currently there are only 3 levels: + * + * Currently there are only 4 levels: * 1 [USER] - information required to understand system behavior under * some load, only key events (e.g. new connections) should * be logged. The events could not be logged on normal level, @@ -55,9 +56,11 @@ enum { * 2 [SUPPORT] - key events at lower (component) levels (e.g. TDB or SS). * Only events required for technical support should be logged * on this level; - * 3 [DEVELOP] - verbose logging, used for engineer debugging internal + * 3 [DEVELOP] - extended logging, used for engineer debugging internal * algorithms and so on. Typically for single slow connection * cases. + * 4 [VERBOSE] - all of the above + verbose dumping of internal structures + * content, e.g. SKB queues, rbtree, etc. */ #define __BNR "[tempesta " BANNER "] " #define __T_DBG1(...) pr_debug(__BNR " " __VA_ARGS__) @@ -76,7 +79,7 @@ enum { #define T_DBG2(...) #endif -#if defined(DEBUG) && (DEBUG == 3) +#if defined(DEBUG) && (DEBUG >= 3) #define T_DBG3(...) __T_DBG3(__VA_ARGS__) #define T_DBG3_BUF(str, buf, len) \ @@ -137,4 +140,10 @@ do { \ #define T_LOG_NL(...) pr_info(__BNR __VA_ARGS__) #endif +#if defined(DEBUG) && (DEBUG >= 4) +#define T_DBGV(...) pr_debug(__BNR "# " __VA_ARGS__) +#else +#define T_DBGV(...) +#endif + #endif /* __LIB_LOG_H__ */ From 239f267038c552b0dbec85fc8e82efc9f9c6a9fe Mon Sep 17 00:00:00 2001 From: Petr Vyazovik Date: Mon, 27 Mar 2023 13:07:28 +0400 Subject: [PATCH 2/3] fw/{server,vhost}: Enabled debug prints Signed-off-by: Petr Vyazovik --- Makefile | 4 +++- fw/server.c | 5 +++++ fw/vhost.c | 5 +++++ 3 files changed, 13 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 97497a965..9f20a4a57 100644 --- a/Makefile +++ b/Makefile @@ -68,6 +68,8 @@ DBG_HTTP_SESS ?= 0 DBG_HTTP_STREAM ?= 0 DBG_HPACK ?= 0 DBG_CACHE ?= 0 +DBG_SRV ?= 0 +DBG_VHOST ?= 0 DBG_TEST ?= 0 TFW_CFLAGS += -DDBG_CFG=$(DBG_CFG) -DDBG_HTTP_PARSER=$(DBG_HTTP_PARSER) TFW_CFLAGS += -DDBG_SS=$(DBG_SS) -DDBG_TLS=$(DBG_TLS) -DDBG_WS=$(DBG_WS) @@ -76,7 +78,7 @@ TFW_CFLAGS += -DDBG_HTTP_FRAME=$(DBG_HTTP_FRAME) TFW_CFLAGS += -DDBG_HTTP_SESS=$(DBG_HTTP_SESS) TFW_CFLAGS += -DDBG_HTTP_STREAM=$(DBG_HTTP_STREAM) TFW_CFLAGS += -DDBG_HPACK=$(DBG_HPACK) -DDBG_CACHE=$(DBG_CACHE) -TFW_CFLAGS += -DDBG_TEST=$(DBG_TEST) +TFW_CFLAGS += -DDBG_SRV=$(DBG_SRV) -DDBG_VHOST=$(DBG_VHOST) -DDBG_TEST=$(DBG_TEST) # By default Tempesta TLS randomizes elliptic curve points using RDRAND # instruction, which provides a high speed random numbers generator. diff --git a/fw/server.c b/fw/server.c index a471b0104..97e5e309a 100644 --- a/fw/server.c +++ b/fw/server.c @@ -23,6 +23,11 @@ #include #include +#undef DEBUG +#if DBG_SRV > 0 +#define DEBUG DBG_SRV +#endif + #include "lib/hash.h" #include "apm.h" #include "client.h" diff --git a/fw/vhost.c b/fw/vhost.c index f80270e06..143b18688 100644 --- a/fw/vhost.c +++ b/fw/vhost.c @@ -20,6 +20,11 @@ #include #include +#undef DEBUG +#if DBG_VHOST > 0 +#define DEBUG DBG_VHOST +#endif + #include "tempesta_fw.h" #include "hash.h" #include "http.h" From 56e19d2bb5d8c11d20d1e66d8753ce35c46137eb Mon Sep 17 00:00:00 2001 From: Petr Vyazovik Date: Thu, 13 Apr 2023 17:47:02 +0400 Subject: [PATCH 3/3] linux-patch: Reserve some space in headroom of outgoing SKBs during IPv4 fragmentation to overcome the issue with the lack of space for transport headers Signed-off-by: Petr Vyazovik --- linux-5.10.35.patch | 76 ++++++++++++++++++++++++++++++++++++--------- 1 file changed, 62 insertions(+), 14 deletions(-) diff --git a/linux-5.10.35.patch b/linux-5.10.35.patch index c78b51fe9..45979fb3c 100644 --- a/linux-5.10.35.patch +++ b/linux-5.10.35.patch @@ -506,10 +506,10 @@ index 0dcd51fee..9a09576a8 100644 struct kvec; diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h -index e37480b5f..617f4e76b 100644 +index e37480b5f..8236d5929 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h -@@ -154,11 +154,32 @@ static inline bool dev_xmit_complete(int rc) +@@ -154,11 +154,22 @@ static inline bool dev_xmit_complete(int rc) # define LL_MAX_HEADER 32 #endif @@ -521,26 +521,16 @@ index e37480b5f..617f4e76b 100644 + * to allocate 16 more bytes (5 - TLS header, 8 - IV, 3 - alignment). + */ +#define TLS_MAX_HDR 16 -+/* -+ * For fast transformation of HTTP/1.1 responses into HTTP/2 format, Tempesta -+ * uses zero-copy in-place rewriting of the response data, right in original -+ * skb. HTTP/2 data is almost always smaller of its source HTTP/1.1 data, but -+ * for the sake of robustness we use 32-byte initial offset in front of skb -+ * data. Thus, in order to guarantee the stack headers to fit, we should -+ * increase the total space for them. -+ */ -+#define HTTP2_MAX_OFFSET 32 +#else +#define TLS_MAX_HDR 0 -+#define HTTP2_MAX_OFFSET 0 +#endif #if !IS_ENABLED(CONFIG_NET_IPIP) && !IS_ENABLED(CONFIG_NET_IPGRE) && \ !IS_ENABLED(CONFIG_IPV6_SIT) && !IS_ENABLED(CONFIG_IPV6_TUNNEL) -#define MAX_HEADER LL_MAX_HEADER -+#define MAX_HEADER (LL_MAX_HEADER + TLS_MAX_HDR + HTTP2_MAX_OFFSET) ++#define MAX_HEADER (LL_MAX_HEADER + TLS_MAX_HDR) #else -#define MAX_HEADER (LL_MAX_HEADER + 48) -+#define MAX_HEADER (LL_MAX_HEADER + 48 + TLS_MAX_HDR + HTTP2_MAX_OFFSET) ++#define MAX_HEADER (LL_MAX_HEADER + 48 + TLS_MAX_HDR) #endif /* @@ -2040,6 +2030,64 @@ index 45fb450b4..48da5be43 100644 } offset++; +diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c +index 97975bed4..672f21290 100644 +--- a/net/ipv4/ip_output.c ++++ b/net/ipv4/ip_output.c +@@ -82,6 +82,9 @@ + #include + #include + #include ++#ifdef CONFIG_SECURITY_TEMPESTA ++#include ++#endif + + static int + ip_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, +@@ -702,7 +705,31 @@ struct sk_buff *ip_frag_next(struct sk_buff *skb, struct ip_frag_state *state) + } + + /* Allocate buffer */ ++#ifdef CONFIG_SECURITY_TEMPESTA ++ /* ++ * Since Tempesta FW tries to reuse incoming SKBs containing the response ++ * from the backend, sometimes we might encounter an SKB with quite a small ++ * head room, which is not big enough to accommodate all the transport headers ++ * and TLS overhead. ++ * It usually the case when working over loopback, tun/tap, bridge or similar ++ * interfaces with small MTU. The issue is specific to aforementioned ifaces ++ * because the outgoing SKB would be injected back to the stack. ++ * In order not to reallocate sk_buffs' headroom on RX path, ++ * allocate and reserve a little bit more memory on TX path. ++ * Even though it would introduce some memory overhead, it's still ++ * cheaper than doing transformation. ++ * ++ * It seems like no such actions are required for IPv6 counterparts: ++ * ip6_fragment() / ip6_frag_next() due to the fact that the ++ * lowest acceptable MTU (1280) is sufficient to fit all the headers. ++ * ++ * When receiving SKBs from the outter world, the NIC driver should ++ * allocate and reserve all necessary space by itself. ++ */ ++ skb2 = alloc_skb(len + state->hlen + MAX_TCP_HEADER, GFP_ATOMIC); ++#else + skb2 = alloc_skb(len + state->hlen + state->ll_rs, GFP_ATOMIC); ++#endif + if (!skb2) + return ERR_PTR(-ENOMEM); + +@@ -711,7 +738,11 @@ struct sk_buff *ip_frag_next(struct sk_buff *skb, struct ip_frag_state *state) + */ + + ip_copy_metadata(skb2, skb); ++#ifdef CONFIG_SECURITY_TEMPESTA ++ skb_reserve(skb2, MAX_TCP_HEADER); ++#else + skb_reserve(skb2, state->ll_rs); ++#endif + skb_put(skb2, len + state->hlen); + skb_reset_network_header(skb2); + skb2->transport_header = skb2->network_header + state->hlen; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 2384ac048..920b1f01f 100644 --- a/net/ipv4/tcp.c