From 09f70b3ca87a47b2481ef450fd6e4b6969bc71d5 Mon Sep 17 00:00:00 2001 From: supervisor Date: Fri, 5 Jul 2024 17:56:34 +0400 Subject: [PATCH] Implemented regex for locations and httptables Made regex configuration same way as in Nginx. --- Makefile | 2 +- fw/cfg.c | 97 +- fw/cfg.h | 10 + fw/http_match.c | 125 +- fw/http_match.h | 10 +- fw/http_tbl.c | 6 +- fw/str.h | 2 + fw/t/unit/test_http_match.c | 3 +- fw/vhost.c | 52 +- install.txt | 42 + regex/Makefile | 82 + regex/alloc.c | 135 + regex/allocator.h | 66 + regex/build.sh | 17 + regex/crc32.h | 58 + regex/database.c | 474 ++++ regex/database.h | 142 + regex/dkms.conf | 8 + regex/fdr/fdr.c | 881 ++++++ regex/fdr/fdr.h | 85 + regex/fdr/fdr_confirm.h | 94 + regex/fdr/fdr_confirm_runtime.h | 104 + regex/fdr/fdr_internal.h | 105 + regex/fdr/fdr_loadval.h | 71 + regex/fdr/flood_runtime.h | 337 +++ regex/fdr/teddy.c | 1114 ++++++++ regex/fdr/teddy.h | 110 + regex/fdr/teddy_avx2.c | 709 +++++ regex/fdr/teddy_internal.h | 66 + regex/fdr/teddy_runtime_common.h | 459 ++++ regex/hs.h | 51 + regex/hs_common.h | 600 ++++ regex/hs_compile.h | 1224 +++++++++ regex/hs_internal.h | 89 + regex/hs_runtime.h | 683 +++++ regex/hs_version.c | 36 + regex/hwlm/hwlm.c | 247 ++ regex/hwlm/hwlm.h | 145 + regex/hwlm/hwlm_internal.h | 62 + regex/hwlm/noodle_engine.c | 447 +++ regex/hwlm/noodle_engine.h | 60 + regex/hwlm/noodle_engine_avx2.c | 244 ++ regex/hwlm/noodle_engine_avx512.c | 191 ++ regex/hwlm/noodle_engine_sse.c | 203 ++ regex/hwlm/noodle_internal.h | 51 + regex/kmod/.clang-format | 683 +++++ regex/kmod/config.h | 109 + regex/kmod/hs_version.h | 39 + regex/kmod/rex.c | 649 +++++ regex/kmod/rex.h | 82 + regex/kmod/rex_trace.h | 37 + regex/kmod/ue2common_kern.h | 106 + regex/nfa/accel.c | 146 + regex/nfa/accel.h | 128 + regex/nfa/callback.h | 72 + regex/nfa/castle.c | 1149 ++++++++ regex/nfa/castle.h | 65 + regex/nfa/castle_internal.h | 143 + regex/nfa/gough.c | 1147 ++++++++ regex/nfa/gough.h | 82 + regex/nfa/gough_internal.h | 134 + regex/nfa/lbr.c | 531 ++++ regex/nfa/lbr.h | 150 + regex/nfa/lbr_common_impl.h | 462 ++++ regex/nfa/lbr_internal.h | 82 + regex/nfa/limex.h | 91 + regex/nfa/limex_64.c | 73 + regex/nfa/limex_accel.c | 170 ++ regex/nfa/limex_accel.h | 79 + regex/nfa/limex_common_impl.h | 431 +++ regex/nfa/limex_context.h | 91 + regex/nfa/limex_exceptional.h | 401 +++ regex/nfa/limex_internal.h | 203 ++ regex/nfa/limex_limits.h | 35 + regex/nfa/limex_native.c | 129 + regex/nfa/limex_ring.h | 106 + regex/nfa/limex_runtime.h | 201 ++ regex/nfa/limex_runtime_impl.h | 1079 ++++++++ regex/nfa/limex_shuffle.h | 78 + regex/nfa/limex_simd128.c | 63 + regex/nfa/limex_simd256.c | 60 + regex/nfa/limex_simd384.c | 60 + regex/nfa/limex_simd512.c | 60 + regex/nfa/limex_state_impl.h | 145 + regex/nfa/mcclellan.c | 1350 +++++++++ regex/nfa/mcclellan.h | 109 + regex/nfa/mcclellan_common_impl.h | 189 ++ regex/nfa/mcclellan_internal.h | 164 ++ regex/nfa/mcsheng.c | 2742 ++++++++++++++++++ regex/nfa/mcsheng.h | 157 ++ regex/nfa/mcsheng_data.c | 55 + regex/nfa/mcsheng_internal.h | 124 + regex/nfa/mpv.c | 1100 ++++++++ regex/nfa/mpv.h | 60 + regex/nfa/mpv_internal.h | 197 ++ regex/nfa/nfa_api.h | 280 ++ regex/nfa/nfa_api_dispatch.c | 368 +++ regex/nfa/nfa_api_queue.h | 289 ++ regex/nfa/nfa_api_util.h | 82 + regex/nfa/nfa_internal.h | 266 ++ regex/nfa/nfa_rev_api.h | 157 ++ regex/nfa/repeat.c | 1611 +++++++++++ regex/nfa/repeat.h | 370 +++ regex/nfa/repeat_internal.h | 218 ++ regex/nfa/sheng.c | 1877 +++++++++++++ regex/nfa/sheng.h | 143 + regex/nfa/sheng_defs.h | 754 
+++++ regex/nfa/sheng_impl.h | 221 ++ regex/nfa/sheng_impl4.h | 711 +++++ regex/nfa/sheng_internal.h | 107 + regex/nfa/shufti.c | 1097 ++++++++ regex/nfa/shufti.h | 61 + regex/nfa/tamarama.c | 441 +++ regex/nfa/tamarama.h | 70 + regex/nfa/tamarama_internal.h | 105 + regex/nfa/truffle.c | 608 ++++ regex/nfa/truffle.h | 57 + regex/nfa/vermicelli.h | 518 ++++ regex/nfa/vermicelli_run.h | 90 + regex/nfa/vermicelli_sse.h | 889 ++++++ regex/report.h | 392 +++ regex/rose/block.c | 422 +++ regex/rose/catchup.c | 900 ++++++ regex/rose/catchup.h | 207 ++ regex/rose/counting_miracle.h | 263 ++ regex/rose/infix.h | 161 ++ regex/rose/init.c | 92 + regex/rose/init.h | 46 + regex/rose/match.c | 632 +++++ regex/rose/match.h | 383 +++ regex/rose/miracle.h | 138 + regex/rose/program_runtime.c | 3509 ++++++++++++++++++++++++ regex/rose/program_runtime.h | 61 + regex/rose/rose.h | 62 + regex/rose/rose_common.h | 56 + regex/rose/rose_internal.h | 659 +++++ regex/rose/rose_program.h | 724 +++++ regex/rose/rose_types.h | 71 + regex/rose/runtime.h | 160 ++ regex/rose/stream.c | 752 +++++ regex/rose/stream_long_lit.h | 372 +++ regex/rose/stream_long_lit_hash.h | 105 + regex/rose/validate_mask.h | 154 ++ regex/rose/validate_shufti.h | 372 +++ regex/runtime.c | 1356 +++++++++ regex/scratch.c | 466 ++++ regex/scratch.h | 276 ++ regex/smallwrite/smallwrite_internal.h | 53 + regex/som/som_operation.h | 84 + regex/som/som_runtime.c | 535 ++++ regex/som/som_runtime.h | 67 + regex/som/som_stream.c | 174 ++ regex/som/som_stream.h | 48 + regex/state.h | 69 + regex/stream_compress.c | 134 + regex/stream_compress.h | 55 + regex/stream_compress_impl.h | 193 ++ regex/ue2common.h | 247 ++ regex/util/arch.h | 92 + regex/util/bitutils.h | 492 ++++ regex/util/compare.h | 183 ++ regex/util/copybytes.h | 113 + regex/util/cpuid_flags.c | 176 ++ regex/util/cpuid_flags.h | 55 + regex/util/cpuid_inline.h | 260 ++ regex/util/exhaust.h | 41 + regex/util/fatbit.h | 93 + regex/util/intrinsics.h | 69 + regex/util/join.h | 40 + regex/util/logical.h | 77 + regex/util/masked_move.c | 91 + regex/util/masked_move.h | 82 + regex/util/multibit.c | 140 + regex/util/multibit.h | 1506 ++++++++++ regex/util/multibit_compress.h | 204 ++ regex/util/multibit_internal.h | 81 + regex/util/pack_bits.h | 227 ++ regex/util/partial_store.h | 163 ++ regex/util/popcount.h | 74 + regex/util/pqueue.h | 109 + regex/util/scatter.h | 55 + regex/util/scatter_runtime.h | 74 + regex/util/simd_types.h | 57 + regex/util/simd_utils.c | 62 + regex/util/simd_utils.h | 1424 ++++++++++ regex/util/state_compress.c | 617 +++++ regex/util/state_compress.h | 68 + regex/util/unaligned.h | 98 + regex/util/uniform_ops.h | 243 ++ scripts/install_regex.sh | 56 + scripts/regex_start.sh | 28 + scripts/regex_stop.sh | 8 + scripts/tempesta.sh | 11 + 193 files changed, 58582 insertions(+), 24 deletions(-) create mode 100644 install.txt create mode 100644 regex/Makefile create mode 100644 regex/alloc.c create mode 100644 regex/allocator.h create mode 100755 regex/build.sh create mode 100644 regex/crc32.h create mode 100644 regex/database.c create mode 100644 regex/database.h create mode 100644 regex/dkms.conf create mode 100644 regex/fdr/fdr.c create mode 100644 regex/fdr/fdr.h create mode 100644 regex/fdr/fdr_confirm.h create mode 100644 regex/fdr/fdr_confirm_runtime.h create mode 100644 regex/fdr/fdr_internal.h create mode 100644 regex/fdr/fdr_loadval.h create mode 100644 regex/fdr/flood_runtime.h create mode 100644 regex/fdr/teddy.c create mode 100644 regex/fdr/teddy.h create mode 100644 
regex/fdr/teddy_avx2.c create mode 100644 regex/fdr/teddy_internal.h create mode 100644 regex/fdr/teddy_runtime_common.h create mode 100644 regex/hs.h create mode 100644 regex/hs_common.h create mode 100644 regex/hs_compile.h create mode 100644 regex/hs_internal.h create mode 100644 regex/hs_runtime.h create mode 100644 regex/hs_version.c create mode 100644 regex/hwlm/hwlm.c create mode 100644 regex/hwlm/hwlm.h create mode 100644 regex/hwlm/hwlm_internal.h create mode 100644 regex/hwlm/noodle_engine.c create mode 100644 regex/hwlm/noodle_engine.h create mode 100644 regex/hwlm/noodle_engine_avx2.c create mode 100644 regex/hwlm/noodle_engine_avx512.c create mode 100644 regex/hwlm/noodle_engine_sse.c create mode 100644 regex/hwlm/noodle_internal.h create mode 100644 regex/kmod/.clang-format create mode 100644 regex/kmod/config.h create mode 100644 regex/kmod/hs_version.h create mode 100644 regex/kmod/rex.c create mode 100644 regex/kmod/rex.h create mode 100644 regex/kmod/rex_trace.h create mode 100644 regex/kmod/ue2common_kern.h create mode 100644 regex/nfa/accel.c create mode 100644 regex/nfa/accel.h create mode 100644 regex/nfa/callback.h create mode 100644 regex/nfa/castle.c create mode 100644 regex/nfa/castle.h create mode 100644 regex/nfa/castle_internal.h create mode 100644 regex/nfa/gough.c create mode 100644 regex/nfa/gough.h create mode 100644 regex/nfa/gough_internal.h create mode 100644 regex/nfa/lbr.c create mode 100644 regex/nfa/lbr.h create mode 100644 regex/nfa/lbr_common_impl.h create mode 100644 regex/nfa/lbr_internal.h create mode 100644 regex/nfa/limex.h create mode 100644 regex/nfa/limex_64.c create mode 100644 regex/nfa/limex_accel.c create mode 100644 regex/nfa/limex_accel.h create mode 100644 regex/nfa/limex_common_impl.h create mode 100644 regex/nfa/limex_context.h create mode 100644 regex/nfa/limex_exceptional.h create mode 100644 regex/nfa/limex_internal.h create mode 100644 regex/nfa/limex_limits.h create mode 100644 regex/nfa/limex_native.c create mode 100644 regex/nfa/limex_ring.h create mode 100644 regex/nfa/limex_runtime.h create mode 100644 regex/nfa/limex_runtime_impl.h create mode 100644 regex/nfa/limex_shuffle.h create mode 100644 regex/nfa/limex_simd128.c create mode 100644 regex/nfa/limex_simd256.c create mode 100644 regex/nfa/limex_simd384.c create mode 100644 regex/nfa/limex_simd512.c create mode 100644 regex/nfa/limex_state_impl.h create mode 100644 regex/nfa/mcclellan.c create mode 100644 regex/nfa/mcclellan.h create mode 100644 regex/nfa/mcclellan_common_impl.h create mode 100644 regex/nfa/mcclellan_internal.h create mode 100644 regex/nfa/mcsheng.c create mode 100644 regex/nfa/mcsheng.h create mode 100644 regex/nfa/mcsheng_data.c create mode 100644 regex/nfa/mcsheng_internal.h create mode 100644 regex/nfa/mpv.c create mode 100644 regex/nfa/mpv.h create mode 100644 regex/nfa/mpv_internal.h create mode 100644 regex/nfa/nfa_api.h create mode 100644 regex/nfa/nfa_api_dispatch.c create mode 100644 regex/nfa/nfa_api_queue.h create mode 100644 regex/nfa/nfa_api_util.h create mode 100644 regex/nfa/nfa_internal.h create mode 100644 regex/nfa/nfa_rev_api.h create mode 100644 regex/nfa/repeat.c create mode 100644 regex/nfa/repeat.h create mode 100644 regex/nfa/repeat_internal.h create mode 100644 regex/nfa/sheng.c create mode 100644 regex/nfa/sheng.h create mode 100644 regex/nfa/sheng_defs.h create mode 100644 regex/nfa/sheng_impl.h create mode 100644 regex/nfa/sheng_impl4.h create mode 100644 regex/nfa/sheng_internal.h create mode 100644 regex/nfa/shufti.c 
create mode 100644 regex/nfa/shufti.h create mode 100644 regex/nfa/tamarama.c create mode 100644 regex/nfa/tamarama.h create mode 100644 regex/nfa/tamarama_internal.h create mode 100644 regex/nfa/truffle.c create mode 100644 regex/nfa/truffle.h create mode 100644 regex/nfa/vermicelli.h create mode 100644 regex/nfa/vermicelli_run.h create mode 100644 regex/nfa/vermicelli_sse.h create mode 100644 regex/report.h create mode 100644 regex/rose/block.c create mode 100644 regex/rose/catchup.c create mode 100644 regex/rose/catchup.h create mode 100644 regex/rose/counting_miracle.h create mode 100644 regex/rose/infix.h create mode 100644 regex/rose/init.c create mode 100644 regex/rose/init.h create mode 100644 regex/rose/match.c create mode 100644 regex/rose/match.h create mode 100644 regex/rose/miracle.h create mode 100644 regex/rose/program_runtime.c create mode 100644 regex/rose/program_runtime.h create mode 100644 regex/rose/rose.h create mode 100644 regex/rose/rose_common.h create mode 100644 regex/rose/rose_internal.h create mode 100644 regex/rose/rose_program.h create mode 100644 regex/rose/rose_types.h create mode 100644 regex/rose/runtime.h create mode 100644 regex/rose/stream.c create mode 100644 regex/rose/stream_long_lit.h create mode 100644 regex/rose/stream_long_lit_hash.h create mode 100644 regex/rose/validate_mask.h create mode 100644 regex/rose/validate_shufti.h create mode 100644 regex/runtime.c create mode 100644 regex/scratch.c create mode 100644 regex/scratch.h create mode 100644 regex/smallwrite/smallwrite_internal.h create mode 100644 regex/som/som_operation.h create mode 100644 regex/som/som_runtime.c create mode 100644 regex/som/som_runtime.h create mode 100644 regex/som/som_stream.c create mode 100644 regex/som/som_stream.h create mode 100644 regex/state.h create mode 100644 regex/stream_compress.c create mode 100644 regex/stream_compress.h create mode 100644 regex/stream_compress_impl.h create mode 100644 regex/ue2common.h create mode 100644 regex/util/arch.h create mode 100644 regex/util/bitutils.h create mode 100644 regex/util/compare.h create mode 100644 regex/util/copybytes.h create mode 100644 regex/util/cpuid_flags.c create mode 100644 regex/util/cpuid_flags.h create mode 100644 regex/util/cpuid_inline.h create mode 100644 regex/util/exhaust.h create mode 100644 regex/util/fatbit.h create mode 100644 regex/util/intrinsics.h create mode 100644 regex/util/join.h create mode 100644 regex/util/logical.h create mode 100644 regex/util/masked_move.c create mode 100644 regex/util/masked_move.h create mode 100644 regex/util/multibit.c create mode 100644 regex/util/multibit.h create mode 100644 regex/util/multibit_compress.h create mode 100644 regex/util/multibit_internal.h create mode 100644 regex/util/pack_bits.h create mode 100644 regex/util/partial_store.h create mode 100644 regex/util/popcount.h create mode 100644 regex/util/pqueue.h create mode 100644 regex/util/scatter.h create mode 100644 regex/util/scatter_runtime.h create mode 100644 regex/util/simd_types.h create mode 100644 regex/util/simd_utils.c create mode 100644 regex/util/simd_utils.h create mode 100644 regex/util/state_compress.c create mode 100644 regex/util/state_compress.h create mode 100644 regex/util/unaligned.h create mode 100644 regex/util/uniform_ops.h create mode 100755 scripts/install_regex.sh create mode 100755 scripts/regex_start.sh create mode 100755 scripts/regex_stop.sh diff --git a/Makefile b/Makefile index c23a2aae4..da11e2e49 100644 --- a/Makefile +++ b/Makefile @@ -146,7 +146,7 @@ KERNEL = 
/lib/modules/$(shell uname -r)/build export KERNEL TFW_CFLAGS AVX2 BMI2 ADX TFW_GCOV -obj-m += lib/ db/core/ fw/ tls/ +obj-m += lib/ db/core/ regex/ fw/ tls/ all: build diff --git a/fw/cfg.c b/fw/cfg.c index 44e47a3e6..6ec5c3e3a 100644 --- a/fw/cfg.c +++ b/fw/cfg.c @@ -109,6 +109,9 @@ * them. Helpers below facilitate that. */ +unsigned short number_of_regex = 0; +unsigned short number_of_db_regex = 0; + static const char * __alloc_and_copy_literal(const char *src, size_t len, bool keep_bs) { @@ -397,6 +400,9 @@ typedef enum { TOKEN_SEMICOLON, TOKEN_LITERAL, TOKEN_ARROW, + TOKEN_TILDA, + TOKEN_REGEX, + TOKEN_REGEX_CI, _TOKEN_COUNT, } token_t; @@ -588,9 +594,12 @@ read_next_token(TfwCfgParserState *ps) TOKEN_NEQSIGN); TFSM_COND_MOVE_EXIT(ps->c == '>' && ps->prev_c == '-', TOKEN_ARROW); + TFSM_COND_MOVE_EXIT(ps->c == '*' && ps->prev_c == '~', + TOKEN_REGEX_CI); /* Special case to differ single equal sign from double one. */ TFSM_COND_MOVE(ps->c == '=', TS_EQSIGN); + TFSM_COND_MOVE(ps->c == '~', TS_TILDA); /* Everything else is not a special character and therefore * it starts a literal. */ @@ -619,6 +628,14 @@ read_next_token(TfwCfgParserState *ps) TFSM_JMP_EXIT(TOKEN_EQSIGN); } + FSM_STATE(TS_TILDA) { + TFSM_COND_JMP_EXIT(!ps->c, TOKEN_REGEX); + + /* If this is double equal sign, eat second sign and exit. */ + TFSM_COND_MOVE_EXIT(ps->c == '*', TOKEN_REGEX_CI); + TFSM_JMP_EXIT(TOKEN_REGEX); + } + FSM_STATE(TS_COMMENT) { TFSM_COND_JMP_EXIT(!ps->c, TOKEN_NA); @@ -732,7 +749,21 @@ entry_set_cond(TfwCfgEntry *e, token_t cond_type, const char *src, int len) if (!(e->name = alloc_and_copy_literal(name, name_len))) return -ENOMEM; - rule->inv = cond_type == TOKEN_DEQSIGN ? false : true; + switch (cond_type) { + case TOKEN_REGEX: + rule->regex = TFW_REGEX_REGULAR; + rule->inv = false; + break; + case TOKEN_REGEX_CI: + rule->regex = TFW_REGEX_CI; + rule->inv = false; + break; + default: + rule->regex = TFW_REGEX_NO; + rule->inv = cond_type == TOKEN_DEQSIGN ? false : true; + break; + } + return 0; } @@ -806,8 +837,10 @@ parse_cfg_entry(TfwCfgParserState *ps) FSM_STATE(PS_PLAIN_OR_RULE) { PFSM_COND_MOVE(ps->t == TOKEN_DEQSIGN || - ps->t == TOKEN_NEQSIGN, - PS_RULE_COND); + ps->t == TOKEN_NEQSIGN || + ps->t == TOKEN_REGEX || + ps->t == TOKEN_REGEX_CI, + PS_RULE_COND); PFSM_COND_MOVE(ps->t == TOKEN_LITERAL, PS_PLAIN_OR_LONG_RULE); /* Jump to plain val/attr scheme to make remained checks @@ -819,8 +852,10 @@ parse_cfg_entry(TfwCfgParserState *ps) FSM_STATE(PS_PLAIN_OR_LONG_RULE) { FSM_COND_JMP(ps->t == TOKEN_DEQSIGN || - ps->t == TOKEN_NEQSIGN, - PS_LONG_RULE_COND); + ps->t == TOKEN_NEQSIGN || + ps->t == TOKEN_REGEX || + ps->t == TOKEN_REGEX_CI, + PS_LONG_RULE_COND); /* This is not rule (simple or extended), so jump to * plain val/attr scheme. 
*/ @@ -828,9 +863,9 @@ parse_cfg_entry(TfwCfgParserState *ps) FSM_COND_JMP(ps->err, PS_EXIT); FSM_COND_JMP(ps->t == TOKEN_EQSIGN, PS_STORE_ATTR_PREV); FSM_COND_JMP(ps->t == TOKEN_LITERAL || - ps->t == TOKEN_SEMICOLON || - ps->t == TOKEN_LBRACE, - PS_STORE_VAL_PREV); + ps->t == TOKEN_SEMICOLON || + ps->t == TOKEN_LBRACE, + PS_STORE_VAL_PREV); ps->err = -EINVAL; FSM_JMP(PS_EXIT); @@ -838,16 +873,20 @@ parse_cfg_entry(TfwCfgParserState *ps) FSM_STATE(PS_LONG_RULE_COND) { ps->err = entry_add_rule_param(&ps->e.rule.fst_ext, - ps->prev_lit, - ps->prev_lit_len); + ps->prev_lit, + ps->prev_lit_len); FSM_COND_JMP(ps->err, PS_EXIT); PFSM_MOVE(PS_RULE_COND); } FSM_STATE(PS_RULE_COND) { + FSM_COND_JMP(ps->prev_t == TOKEN_REGEX || + ps->prev_t == TOKEN_REGEX_CI, + PS_STORE_VAL_PREV_REGEX); + PFSM_COND_JMP_EXIT_ERROR(ps->t != TOKEN_LITERAL); ps->err = entry_set_cond(&ps->e, ps->prev_t, ps->lit, - ps->lit_len); + ps->lit_len); FSM_COND_JMP(ps->err, PS_EXIT); PFSM_MOVE(PS_RULE_COND_END); } @@ -866,7 +905,7 @@ parse_cfg_entry(TfwCfgParserState *ps) FSM_STATE(PS_RULE_ACTION) { PFSM_COND_JMP_EXIT_ERROR(ps->t != TOKEN_LITERAL); ps->err = entry_add_rule_param(&ps->e.rule.act, ps->lit, - ps->lit_len); + ps->lit_len); FSM_COND_JMP(ps->err, PS_EXIT); PFSM_MOVE(PS_RULE_ACTION_VAL); } @@ -878,7 +917,7 @@ parse_cfg_entry(TfwCfgParserState *ps) PFSM_COND_JMP_EXIT_ERROR(ps->t != TOKEN_LITERAL); ps->err = entry_add_rule_param(&ps->e.rule.val, ps->lit, - ps->lit_len); + ps->lit_len); FSM_COND_JMP(ps->err, PS_EXIT); read_next_token(ps); @@ -914,6 +953,38 @@ parse_cfg_entry(TfwCfgParserState *ps) FSM_JMP(PS_STORE_VAL_PREV); } + FSM_STATE(PS_STORE_VAL_PREV_REGEX) { + /* name val1 val2; + * ^ + * We are here (but still need to store val1) + * and name or condition. + */ + T_DBG3("add value: %.*s\n", ps->prev_lit_len, ps->prev_lit); + + if (ps->e.ftoken && !strcmp(ps->e.ftoken, "location")) { + ps->err = entry_set_name(&ps->e); + + if (!ps->err) { + if (ps->prev_t == TOKEN_REGEX) + ps->err = entry_add_val(&ps->e, "regex", + sizeof("regex")); + if (ps->prev_t == TOKEN_REGEX_CI) + ps->err = entry_add_val(&ps->e, + "regex_ci", + sizeof("regex_ci")); + } + FSM_COND_JMP(ps->err, PS_EXIT); + FSM_JMP(PS_VAL_OR_ATTR); + } + + /*If it is not location*/ + ps->err = entry_set_cond(&ps->e, ps->prev_t, + ps->lit, ps->lit_len); + FSM_COND_JMP(ps->err, PS_EXIT); + PFSM_MOVE(PS_RULE_COND_END); + + } + FSM_STATE(PS_STORE_VAL_PREV) { /* name val1 val2; * ^ diff --git a/fw/cfg.h b/fw/cfg.h index b8f672d16..2978f7c12 100644 --- a/fw/cfg.h +++ b/fw/cfg.h @@ -149,6 +149,7 @@ typedef struct { const char *act; const char *val; bool inv; + int regex; } TfwCfgRule; typedef struct { @@ -336,6 +337,9 @@ struct TfwCfgSpec { void (*cleanup)(TfwCfgSpec *self); }; +extern unsigned short number_of_regex; +extern unsigned short number_of_db_regex; + /** * Walks over a NULL-terminated array of TfwCfgSpec structures. */ @@ -412,6 +416,12 @@ enum { TFW_CFG_B_KEEP, /* Keep an entry */ }; +enum { + TFW_REGEX_NO = 0, + TFW_REGEX_REGULAR, + TFW_REGEX_CI, +}; + #define TFW_CFG_F_ADD (1 << TFW_CFG_B_ADD) #define TFW_CFG_F_DEL (1 << TFW_CFG_B_DEL) #define TFW_CFG_F_MOD (1 << TFW_CFG_B_MOD) diff --git a/fw/http_match.c b/fw/http_match.c index 67ae151c6..9b13d91b0 100644 --- a/fw/http_match.c +++ b/fw/http_match.c @@ -71,6 +71,7 @@ #include "http_match.h" #include "http_msg.h" #include "cfg.h" +#include "regex/kmod/rex.h" /** * Map an operator to that flags passed to tfw_str_eq_*() functions. 
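Taken together, the tokenizer ('~', '~*') and rule-condition changes above are meant to let the configuration express regex matches the way Nginx does. A rough sketch of the syntax they should accept (the vhost names and patterns below are invented for illustration and are not part of the patch):

	http_chain {
		uri ~  "^/api/v[0-9]+/"  -> app;       # case-sensitive regex condition
		uri ~* "\.(jpg|png)$"    -> static;    # case-insensitive regex condition
		uri == "/healthz"        -> monitor;   # existing exact match is unchanged
	}

	# For 'location', '~' / '~*' are rewritten by the parser into the
	# "regex" / "regex_ci" match operators registered in tfw_match_enum.
	location ~ "^/user/[0-9]+$" {
		# per-location settings as usual
	}
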
@@ -83,11 +84,34 @@ map_op_to_str_eq_flags(tfw_http_match_op_t op) [TFW_HTTP_MATCH_O_EQ] = TFW_STR_EQ_DEFAULT, [TFW_HTTP_MATCH_O_PREFIX] = TFW_STR_EQ_PREFIX, [TFW_HTTP_MATCH_O_SUFFIX] = TFW_STR_EQ_DEFAULT, + [TFW_HTTP_MATCH_O_REGEX] = TFW_STR_EQ_REGEX, + [TFW_HTTP_MATCH_O_REGEX_CI] = TFW_STR_EQ_REGEX_CASEI, }; BUG_ON(flags_tbl[op] < 0); return flags_tbl[op]; } +//extern int bpf_scan_bytes(const void *, __u32, struct rex_scan_attr *); + +extern int bpf_scan_tfwstr(const TfwStr *str, struct rex_scan_attr *attr); + +bool +tfw_match_regex(tfw_match_t op, const char *cstr, size_t len, const TfwStr *arg) +{ + bool result; + int r; + + struct rex_scan_attr attr = {}; + memcpy(&attr.database_id, cstr, sizeof(unsigned short)); + + if (!arg->len) + return false; + + r = bpf_scan_tfwstr(arg, &attr); + result = (!r && attr.nr_events && attr.last_event.expression); + return result; +} + static bool tfw_rule_str_match(const TfwStr *str, const char *cstr, int cstr_len, tfw_str_eq_flags_t flags, @@ -97,6 +121,9 @@ tfw_rule_str_match(const TfwStr *str, const char *cstr, return tfw_str_eq_cstr_off(str, str->len - cstr_len, cstr, cstr_len, flags); + if (op == TFW_HTTP_MATCH_O_REGEX) + return tfw_match_regex(op, cstr, cstr_len, str); + return tfw_str_eq_cstr(str, cstr, cstr_len, flags); } @@ -706,10 +733,93 @@ tfw_http_escape_pre_post(char *out , const char *str) return len; } +/* + * Here we create a text file for every regex string so that it + * can be read by hscollider. + * hscollider then compiles it and saves it to a temporary DB. + * After that it is loaded into the regex module DB. + * All operations after the file creation are done by the regex_start.sh script. + * + * Since one DB can potentially contain several expressions, + * there are two variables: + * number_of_db_regex - number of the database which we will use to look for + * the expression; + * number_of_regex - number of the expression, used to know which expression + * exactly was matched (parsing for it has not been implemented yet). + * + * After this function, number_of_db_regex is written to the start of arg, + * so the regex string must be at least two bytes long. + * + * The directory /tmp/tempesta is created by the + * tempesta.sh script.
+ */ +int +write_regex(const char *arg, int regex) +{ + struct file *fl; + loff_t off = 0; + int r; + char file_name[25]; + char reg_number[6]; + int len = strlen(arg); + int len1; + + if (len < sizeof(unsigned short)) { + T_ERR_NL("String of regex too short\n"); + return -EINVAL; + } + + ++number_of_db_regex; + sprintf(file_name, "/tmp/tempesta/%u.txt", number_of_db_regex); + + fl = filp_open(file_name, O_CREAT | O_WRONLY, 0600); + if (IS_ERR(fl)) { + T_ERR_NL("Cannot create regex file %s\n", + file_name); + return -EINVAL; + } + BUG_ON(!fl || !fl->f_path.dentry); + + if (!fl->f_op->fallocate) { + T_ERR_NL("File requires filesystem with fallocate support\n"); + filp_close(fl, NULL); + return -EINVAL; + } + + ++number_of_regex; + sprintf(reg_number, "%i:", number_of_regex); + len1 = strlen(reg_number); + r = kernel_write(fl, (void *)reg_number, len1, &off); + if (r != len1) + goto err; + + r = kernel_write(fl, (void *)arg, len, &off); + if (r != len) + goto err; + + if (regex == TFW_REGEX_CI) { + r = kernel_write(fl, "i", 1, &off); + if (r != 1) + goto err; + } + + r = kernel_write(fl, "\n", 1, &off); + if (r != 1) + goto err; + + filp_close(fl, NULL); + return 0; +err: + T_ERR_NL("Cannot write regex\n"); + filp_close(fl, NULL); + return r; +} + const char * tfw_http_arg_adjust(const char *arg, tfw_http_match_fld_t field, - const char *raw_hdr_name, size_t *size_out, - tfw_http_match_arg_t *type_out, + const char *raw_hdr_name, int regex, + size_t *size_out, + tfw_http_match_arg_t *type_out, tfw_http_match_op_t *op_out) { char *arg_out, *pos; @@ -751,6 +861,11 @@ tfw_http_arg_adjust(const char *arg, tfw_http_match_fld_t field, if (wc_arg || (len > 1 && arg[len - 1] == '*' && arg[len - 2] != '\\')) *op_out = TFW_HTTP_MATCH_O_PREFIX; + if (!wc_arg && regex) { + *op_out = TFW_HTTP_MATCH_O_REGEX; + write_regex(arg, regex); + } + /* * For argument started with wildcard, the suffix matching * pattern should be applied. 
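For concreteness, the first case-insensitive regex rule in a configuration makes write_regex() above create /tmp/tempesta/1.txt containing a single line of the form shown below; the pattern itself is an invented example, the leading "1:" is number_of_regex and the trailing "i" marks TFW_REGEX_CI:

	1:^/static/.+\.jpg$i

hscollider is then expected to compile this file into database 1, the id that is later copied into the start of arg (and of loc->arg) for use at match time.
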
@@ -779,6 +894,12 @@ tfw_http_arg_adjust(const char *arg, tfw_http_match_fld_t field, len = tfw_http_escape_pre_post(pos, arg); *size_out += full_name_len + len + 1; + /* + * Save number_of_db_regex to use it in tfw_match_regex + */ + if (*op_out == TFW_HTTP_MATCH_O_REGEX) + memcpy(arg_out, &number_of_db_regex, sizeof(number_of_db_regex)); + return arg_out; } diff --git a/fw/http_match.h b/fw/http_match.h index 3da5b3df0..53c5d27f5 100644 --- a/fw/http_match.h +++ b/fw/http_match.h @@ -52,6 +52,8 @@ typedef enum { TFW_HTTP_MATCH_O_EQ, TFW_HTTP_MATCH_O_PREFIX, TFW_HTTP_MATCH_O_SUFFIX, + TFW_HTTP_MATCH_O_REGEX, + TFW_HTTP_MATCH_O_REGEX_CI,/*case insensitive*/ _TFW_HTTP_MATCH_O_COUNT } tfw_http_match_op_t; @@ -156,7 +158,8 @@ TfwHttpMatchRule *tfw_http_rule_new(TfwHttpChain *chain, int tfw_http_rule_arg_init(TfwHttpMatchRule *rule, const char *arg, size_t arg_len); const char *tfw_http_arg_adjust(const char *arg, tfw_http_match_fld_t field, - const char *raw_hdr_name, size_t *size_out, + const char *raw_hdr_name, int regex, + size_t *size_out, tfw_http_match_arg_t *type_out, tfw_http_match_op_t *op_out); const char *tfw_http_val_adjust(const char *val, tfw_http_match_fld_t field, @@ -170,6 +173,11 @@ int tfw_http_search_cookie(const char *cstr, unsigned long clen, TfwStr **pos, TfwStr *end, TfwStr *val, tfw_http_match_op_t op, bool is_resp_hdr); +int write_regex(const char *arg, int regex); + +bool tfw_match_regex(tfw_match_t op, const char *cstr, size_t len, + const TfwStr *arg); + #define tfw_http_chain_rules_for_each(chain, func) \ ({ \ int r = 0; \ diff --git a/fw/http_tbl.c b/fw/http_tbl.c index cf9a9ef8b..5abf30798 100644 --- a/fw/http_tbl.c +++ b/fw/http_tbl.c @@ -382,7 +382,7 @@ tfw_cfgop_http_rule(TfwCfgSpec *cs, TfwCfgEntry *e) const char *in_field, *in_field_val, *action, *action_val, *in_arg, *arg = NULL, *val = NULL; unsigned int invert, hid = TFW_HTTP_HDR_RAW, - act_val_parsed, val_len; + act_val_parsed, val_len, regex; tfw_http_match_op_t op = TFW_HTTP_MATCH_O_WILDCARD, op_val = TFW_HTTP_MATCH_O_WILDCARD; tfw_http_match_fld_t field = TFW_HTTP_MATCH_F_WILDCARD; @@ -398,6 +398,7 @@ tfw_cfgop_http_rule(TfwCfgSpec *cs, TfwCfgEntry *e) TFW_CFG_CHECK_NO_ATTRS(cs, e); invert = cfg_rule->inv; + regex = cfg_rule->regex; in_field = cfg_rule->fst; in_field_val = cfg_rule->fst_ext; in_arg = cfg_rule->snd; @@ -432,7 +433,8 @@ tfw_cfgop_http_rule(TfwCfgSpec *cs, TfwCfgEntry *e) } arg = tfw_http_arg_adjust(in_arg, field, in_field_val, - &arg_size, &type, &op); + cfg_rule->regex, &arg_size, + &type, &op); if (IS_ERR(arg)) return PTR_ERR(arg); } diff --git a/fw/str.h b/fw/str.h index 136860fd2..27fda9d64 100644 --- a/fw/str.h +++ b/fw/str.h @@ -437,6 +437,8 @@ typedef enum { TFW_STR_EQ_PREFIX = 0x1, TFW_STR_EQ_CASEI = 0x2, TFW_STR_EQ_PREFIX_CASEI = (TFW_STR_EQ_PREFIX | TFW_STR_EQ_CASEI), + TFW_STR_EQ_REGEX = 0x4, + TFW_STR_EQ_REGEX_CASEI = (TFW_STR_EQ_REGEX | TFW_STR_EQ_CASEI), } tfw_str_eq_flags_t; int tfw_strcpy(TfwStr *dst, const TfwStr *src); diff --git a/fw/t/unit/test_http_match.c b/fw/t/unit/test_http_match.c index ec8cb88c0..9da170ad2 100644 --- a/fw/t/unit/test_http_match.c +++ b/fw/t/unit/test_http_match.c @@ -139,7 +139,8 @@ test_chain_add_rule_str(int test_id, tfw_http_match_fld_t field, tfw_http_verify_hdr_field(field, &in_val, &hid); } val = tfw_http_val_adjust(in_val, field, &val_len, &val_type, &op_val); - arg = tfw_http_arg_adjust(in_arg, field, in_val, &arg_size, &type, &op); + arg = tfw_http_arg_adjust(in_arg, field, in_val, 0, + &arg_size, &type, &op); EXPECT_NOT_NULL(arg); 
if (!arg) return; diff --git a/fw/vhost.c b/fw/vhost.c index eefd2ce41..19a967283 100644 --- a/fw/vhost.c +++ b/fw/vhost.c @@ -39,6 +39,7 @@ #include "http_sess.h" #include "client.h" #include "tls_conf.h" +#include "regex/kmod/rex.h" /* * The hash table entry for mapping @sni to @vhost for SAN certificates handling. @@ -75,6 +76,10 @@ static const TfwCfgEnum tfw_match_enum[] = { { "eq", TFW_HTTP_MATCH_O_EQ }, { "prefix", TFW_HTTP_MATCH_O_PREFIX }, { "suffix", TFW_HTTP_MATCH_O_SUFFIX }, + /*regex case sensitive*/ + { "regex", TFW_HTTP_MATCH_O_REGEX }, + /*regex* case insensitive*/ + { "regex_ci", TFW_HTTP_MATCH_O_REGEX_CI }, { 0 } }; @@ -177,6 +182,14 @@ __tfw_match_prefix(tfw_match_t op, const char *cstr, size_t len, TfwStr *arg) return tfw_str_eq_cstr(arg, cstr, len, flags); } +extern int bpf_scan_bytes(const void *, __u32, struct rex_scan_attr *); + +static bool +__tfw_match_regex(tfw_match_t op, const char *cstr, size_t len, TfwStr *arg) +{ + return tfw_match_regex(op, cstr, len, arg); +} + typedef bool (*__tfw_match_fn)(tfw_match_t, const char *, size_t, TfwStr *); static const __tfw_match_fn __tfw_match_fn_tbl[] = { @@ -185,6 +198,8 @@ static const __tfw_match_fn __tfw_match_fn_tbl[] = { [TFW_HTTP_MATCH_O_EQ] = __tfw_match_eq, [TFW_HTTP_MATCH_O_PREFIX] = __tfw_match_prefix, [TFW_HTTP_MATCH_O_SUFFIX] = __tfw_match_suffix, + [TFW_HTTP_MATCH_O_REGEX] = __tfw_match_regex, + [TFW_HTTP_MATCH_O_REGEX_CI] = __tfw_match_regex, }; /* @@ -1290,8 +1305,15 @@ tfw_location_init(TfwLocation *loc, tfw_match_t op, const char *arg, + sizeof(TfwHdrModsDesc) * TFW_USRHDRS_ARRAY_SZ * 2 + sizeof(TfwHdrModsDesc *) * TFW_HTTP_HDR_RAW * 2; - if ((argmem = kmalloc(len + 1, GFP_KERNEL)) == NULL) - return -ENOMEM; + if (op != TFW_HTTP_MATCH_O_REGEX) { + if ((argmem = kmalloc(len + 1, GFP_KERNEL)) == NULL) + return -ENOMEM; + } + else {/*If it is a regex we need only number of DB*/ + if ((argmem = kmalloc(2 + 1, GFP_KERNEL)) == NULL) + return -ENOMEM; + } + if ((data = kzalloc(size, GFP_KERNEL)) == NULL) { kfree(argmem); return -ENOMEM; @@ -1325,7 +1347,27 @@ tfw_location_init(TfwLocation *loc, tfw_match_t op, const char *arg, (TfwHdrModsDesc **)(loc->mod_hdrs[TFW_VHOST_HDRMOD_RESP].hdrs + TFW_USRHDRS_ARRAY_SZ); - memcpy((void *)loc->arg, (void *)arg, len + 1); + switch (op) { + case TFW_HTTP_MATCH_O_REGEX: + write_regex(arg, TFW_REGEX_REGULAR); + /* + * Save number_of_db_regex to use it in tfw_match_regex + */ + memcpy((void *)loc->arg, (void *)&number_of_db_regex, + sizeof(number_of_db_regex)); + break; + case TFW_HTTP_MATCH_O_REGEX_CI: + write_regex(arg, TFW_REGEX_CI); + /* + * Save number_of_db_regex to use it in tfw_match_regex + */ + memcpy((void *)loc->arg, (void *)&number_of_db_regex, + sizeof(number_of_db_regex)); + break; + default: + memcpy((void *)loc->arg, (void *)arg, len + 1); + break; + } return 0; } @@ -1344,7 +1386,6 @@ tfw_location_new(TfwVhost *vhost, tfw_match_t op, const char *arg, size_t len) if (tfw_location_init(loc, op, arg, len, vhost->hdrs_pool)) return NULL; vhost->loc_sz++; - if (tfw_frang_cfg_inherit(loc->frang_cfg, vhost->loc_dflt->frang_cfg)) return NULL; @@ -2351,6 +2392,9 @@ tfw_vhost_cfgstart(void) { TfwVhost *vh_dflt; + number_of_regex = 0; + number_of_db_regex = 0; + BUG_ON(tfw_vhosts_reconfig); tfw_vhosts_reconfig = kmalloc(sizeof(TfwVhostList), GFP_KERNEL); if (!tfw_vhosts_reconfig) { diff --git a/install.txt b/install.txt new file mode 100644 index 000000000..0199af594 --- /dev/null +++ b/install.txt @@ -0,0 +1,42 @@ +Colm (Colm Programming Language) +git clone 
https://github.com/adrian-thurston/colm.git + +$ ./autogen.sh +$ ./configure +$ make +$ make install + +add LD_LIBRARY_PATH="/usr/local/lib" to /etc/environment + + +Regal +git clone https://github.com/adrian-thurston/ragel.git + +$ ./autogen.sh +$ ./configure --with-colm=/usr/local +$ make +$ make install + + +PCRE +download PCRE from sourceforge +wget https://sourceforge.net/projects/pcre/files/pcre/8.45/pcre-8.45.tar.gz +tar -xf archive.tar.gz + +$ ./configure --enable-pcre16 --enable-pcre32 +$ make +$ make install + + + +git clone https://github.com/tempesta-tech/linux-regex-module.git + +cmake ./ +make + +after compilation +copy hscollider from /linux-regex-module/bin/ to /tempesta/scripts/ or default app directory??? + +git clone https://github.com/tempesta-tech/tempesta.git +cd tempesta +git checkout ag_Multi-pattern-regular-expressions diff --git a/regex/Makefile b/regex/Makefile new file mode 100644 index 000000000..3d5041566 --- /dev/null +++ b/regex/Makefile @@ -0,0 +1,82 @@ +obj-m := xdp_rex.o + +CC_FLAGS_HSRUNTIME := -isystem $(shell $(CC) -print-file-name=include) +CC_FLAGS_REMOVE_SIMD := -mno-80387 -mno-fp-ret-in-387 -mno-sse -mno-mmx -mno-sse2 -mno-3dnow -mno-avx +CC_FLAGS_HSRUNTIME += -DHAVE_SSE2 +CC_FLAGS_SIMD := -msse4.2 -msse4.1 +CC_FLAGS_HSRUNTIME += -DHAVE_SSE41 -DHAVE_SSE42 +CC_FLAGS_SIMD += -mavx -mavx2 +CC_FLAGS_HSRUNTIME += -DHAVE_AVX -DHAVE_AVX2 +#CC_FLAGS_SIMD += -mavx512f -mavx512cd -mavx512bw -mavx512vl -mavx512vnni +#CC_FLAGS_HSRUNTIME += -DHAVE_AVX512 +#CC_FLAGS_SIMD += -mavx512vbmi -mavx512vbmi2 -mavx512vnni +#CC_FLAGS_HSRUNTIME += -DHAVE_AVX512VBMI + +CC_FLAGS_HSRUNTIME += $(CC_FLAGS_SIMD) +CC_FLAGS_REMOVE_HSRUNTIME := $(CC_FLAGS_REMOVE_SIMD) +CC_FLAGS_REMOVE_HSRUNTIME += -Wdeclaration-after-statement +CC_FLAGS_HSRUNTIME += -Wframe-larger-than=2048 +CC_FLAGS_HSRUNTIME += -std=gnu11 +CC_FLAGS_REMOVE_HSRUNTIME += -std=gnu99 + +ccflags-y += -std=c99 +ccflags-y += -I$(src) -I$(src)/kmod -I$(src)/../ +ccflags-y += $(CC_FLAGS_HSRUNTIME) +ccflags-remove-y += $(CC_FLAGS_REMOVE_HSRUNTIME) + +CFLAGS_kmod/rex.o := $(CC_FLAGS_REMOVE_HSRUNTIME) +CFLAGS_REMOVE_kmod/rex.o := $(CC_FLAGS_HSRUNTIME) +CFLAGS_alloc.o := $(CC_FLAGS_REMOVE_SIMD) +CFLAGS_REMOVE_alloc.o := $(CC_FLAGS_SIMD) +CFLAGS_scratch.o := $(CC_FLAGS_REMOVE_SIMD) +CFLAGS_REMOVE_scratch.o := $(CC_FLAGS_SIMD) +CFLAGS_database.o := $(CC_FLAGS_REMOVE_SIMD) +CFLAGS_REMOVE_database.o := $(CC_FLAGS_SIMD) + +xdp_rex-m := kmod/rex.o \ + alloc.o \ + scratch.o \ + runtime.o \ + database.o \ + hs_version.o \ + stream_compress.o \ + fdr/fdr.o \ + fdr/teddy_avx2.o \ + fdr/teddy.o \ + hwlm/hwlm.o \ + hwlm/noodle_engine.o \ + nfa/accel.o \ + nfa/castle.o \ + nfa/gough.o \ + nfa/lbr.o \ + nfa/limex_64.o \ + nfa/limex_accel.o \ + nfa/limex_native.o \ + nfa/limex_simd128.o \ + nfa/limex_simd256.o \ + nfa/limex_simd384.o \ + nfa/limex_simd512.o \ + nfa/mcclellan.o \ + nfa/mcsheng.o \ + nfa/mcsheng_data.o \ + nfa/mpv.o \ + nfa/nfa_api_dispatch.o \ + nfa/repeat.o \ + nfa/sheng.o \ + nfa/shufti.o \ + nfa/tamarama.o \ + nfa/truffle.o \ + rose/block.o \ + rose/catchup.o \ + rose/init.o \ + rose/match.o \ + rose/program_runtime.o \ + rose/stream.o \ + som/som_runtime.o \ + som/som_stream.o \ + util/cpuid_flags.o \ + util/masked_move.o \ + util/multibit.o \ + util/simd_utils.o \ + util/state_compress.o \ +# diff --git a/regex/alloc.c b/regex/alloc.c new file mode 100644 index 000000000..27c9111fb --- /dev/null +++ b/regex/alloc.c @@ -0,0 +1,135 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * + * Redistribution and use in 
source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Runtime functions for setting custom allocators. + */ + +#ifndef __KERNEL__ +#include +#include +#else +#include +#include +#include +#endif + +#include "allocator.h" + +#if !defined(__KERNEL__) + +#define default_malloc malloc +#define default_free free + +#else + +static void *default_malloc(size_t size) { + WARN_ON_ONCE(in_serving_softirq()); + return kmalloc(size, GFP_KERNEL); +} + +static void default_free(void *ptr) { + WARN_ON_ONCE(in_serving_softirq()); + return kfree(ptr); +} + +#endif + +hs_alloc_t hs_database_alloc = default_malloc; +hs_alloc_t hs_misc_alloc = default_malloc; +hs_alloc_t hs_scratch_alloc = default_malloc; +hs_alloc_t hs_stream_alloc = default_malloc; + +hs_free_t hs_database_free = default_free; +hs_free_t hs_misc_free = default_free; +hs_free_t hs_scratch_free = default_free; +hs_free_t hs_stream_free = default_free; + +static +hs_alloc_t normalise_alloc(hs_alloc_t a) { + if (!a) { + return default_malloc; + } else { + return a; + } +} + +static +hs_free_t normalise_free(hs_free_t f) { + if (!f) { + return default_free; + } else { + return f; + } +} + +HS_PUBLIC_API +hs_error_t HS_CDECL hs_set_allocator(hs_alloc_t allocfunc, hs_free_t freefunc) { + hs_set_database_allocator(allocfunc, freefunc); + hs_set_misc_allocator(allocfunc, freefunc); + hs_set_stream_allocator(allocfunc, freefunc); + hs_set_scratch_allocator(allocfunc, freefunc); + + return HS_SUCCESS; +} + +HS_PUBLIC_API +hs_error_t HS_CDECL hs_set_database_allocator(hs_alloc_t allocfunc, + hs_free_t freefunc) { + hs_database_alloc = normalise_alloc(allocfunc); + hs_database_free = normalise_free(freefunc); + + return HS_SUCCESS; +} + +HS_PUBLIC_API +hs_error_t HS_CDECL hs_set_misc_allocator(hs_alloc_t allocfunc, + hs_free_t freefunc) { + hs_misc_alloc = normalise_alloc(allocfunc); + hs_misc_free = normalise_free(freefunc); + + return HS_SUCCESS; +} + +HS_PUBLIC_API +hs_error_t HS_CDECL hs_set_scratch_allocator(hs_alloc_t allocfunc, + hs_free_t freefunc) { + hs_scratch_alloc = normalise_alloc(allocfunc); + hs_scratch_free = 
normalise_free(freefunc); + + return HS_SUCCESS; +} + +HS_PUBLIC_API +hs_error_t HS_CDECL hs_set_stream_allocator(hs_alloc_t allocfunc, + hs_free_t freefunc) { + hs_stream_alloc = normalise_alloc(allocfunc); + hs_stream_free = normalise_free(freefunc); + + return HS_SUCCESS; +} diff --git a/regex/allocator.h b/regex/allocator.h new file mode 100644 index 000000000..61c20f914 --- /dev/null +++ b/regex/allocator.h @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2015, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef ALLOCATOR_H +#define ALLOCATOR_H + +#include "hs_common.h" +#include "ue2common.h" + +#ifdef __cplusplus +extern "C" +{ +#endif +extern hs_alloc_t hs_database_alloc; +extern hs_alloc_t hs_misc_alloc; +extern hs_alloc_t hs_scratch_alloc; +extern hs_alloc_t hs_stream_alloc; + +extern hs_free_t hs_database_free; +extern hs_free_t hs_misc_free; +extern hs_free_t hs_scratch_free; +extern hs_free_t hs_stream_free; +#ifdef __cplusplus +} /* extern C */ +#endif +/** \brief Check the results of an alloc done with hs_alloc for alignment. + * + * If we have incorrect alignment, return an error. Caller should free the + * offending block. */ +static really_inline +hs_error_t hs_check_alloc(const void *mem) { + hs_error_t ret = HS_SUCCESS; + if (!mem) { + ret = HS_NOMEM; + } else if (!ISALIGNED_N(mem, alignof(unsigned long long))) { + ret = HS_BAD_ALLOC; + } + return ret; +} + +#endif diff --git a/regex/build.sh b/regex/build.sh new file mode 100755 index 000000000..48e65a18d --- /dev/null +++ b/regex/build.sh @@ -0,0 +1,17 @@ +#!/bin/bash -xe + +kernel_source_dir="$1" +linux_image=/boot/"vmlinuz-${kernelver}" +shift 1 + +# Make our own source tree and extract vmlinux into it. 
+subdirs=$(ls -A "${kernel_source_dir}"/) +mkdir -p linux +for d in $subdirs; do + ln -s "${kernel_source_dir}"/"$d" linux/"$d" +done + +linux/scripts/extract-vmlinux "${linux_image}" \ + > linux/vmlinux + +exec make -C linux "$@" diff --git a/regex/crc32.h b/regex/crc32.h new file mode 100644 index 000000000..f9c960c10 --- /dev/null +++ b/regex/crc32.h @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2015, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef CRC32_H_36A5015B5840C1 +#define CRC32_H_36A5015B5840C1 + +#include "ue2common.h" + +#ifndef __KERNEL__ + +#ifdef __cplusplus +extern "C" +{ +#endif + +u32 Crc32c_ComputeBuf(u32 inCrc32, const void *buf, size_t bufLen); + +#ifdef __cplusplus +} +#endif + +#else /* __KERNEL */ + +#include + +static inline u32 Crc32c_ComputeBuf(u32 inCrc32, const void *buf, size_t bufLen) { + return __crc32c_le(inCrc32, (unsigned char const*) buf, bufLen); +} + +#endif + +#endif /* CRC32_H_36A5015B5840C1 */ + diff --git a/regex/database.c b/regex/database.c new file mode 100644 index 000000000..09d49b0be --- /dev/null +++ b/regex/database.c @@ -0,0 +1,474 @@ +/* + * Copyright (c) 2015-2020, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Runtime code for hs_database manipulation. + */ + +#ifndef __KERNEL__ +#include +#include +#else +#include +#include +#endif + +#include "allocator.h" +#include "hs_common.h" +#include "hs_internal.h" +#include "hs_version.h" +#include "ue2common.h" +#include "database.h" +#include "crc32.h" +#include "rose/rose_internal.h" +#include "util/unaligned.h" + +static really_inline +int db_correctly_aligned(const void *db) { + return ISALIGNED_N(db, alignof(unsigned long long)); +} + +HS_PUBLIC_API +hs_error_t HS_CDECL hs_free_database(hs_database_t *db) { + if (db && db->magic != HS_DB_MAGIC) { + return HS_INVALID; + } + hs_database_free(db); + + return HS_SUCCESS; +} + +HS_PUBLIC_API +hs_error_t HS_CDECL hs_serialize_database(const hs_database_t *db, char **bytes, + size_t *serialized_length) { + if (!db || !bytes || (!serialized_length && !*bytes)) { + return HS_INVALID; + } + + if (!db_correctly_aligned(db)) { + return HS_BAD_ALIGN; + } + + hs_error_t ret = validDatabase(db); + if (ret != HS_SUCCESS) { + return ret; + } + + size_t length = sizeof(struct hs_database) + db->length; + char *out; + + if (serialized_length) { + out = hs_misc_alloc(length); + ret = hs_check_alloc(out); + if (ret != HS_SUCCESS) { + hs_misc_free(out); + return ret; + } + } else { + out = *bytes; + } + + memset(out, 0, length); + + u32 *buf = (u32 *)out; + *buf = db->magic; + buf++; + *buf = db->version; + buf++; + *buf = db->length; + buf++; + memcpy(buf, &db->platform, sizeof(u64a)); + buf += 2; + *buf = db->crc32; + buf++; + *buf = db->reserved0; + buf++; + *buf = db->reserved1; + buf++; + + const char *bytecode = hs_get_bytecode(db); + memcpy(buf, bytecode, db->length); + + if (serialized_length) { + *bytes = out; + *serialized_length = length; + } + return HS_SUCCESS; +} + +// check that the database header's platform is compatible with the current +// runtime platform. +static +hs_error_t db_check_platform(const u64a p) { + if (p != hs_current_platform + && p != (hs_current_platform | hs_current_platform_no_avx2) + && p != (hs_current_platform | hs_current_platform_no_avx512) + && p != (hs_current_platform | hs_current_platform_no_avx512vbmi)) { + return HS_DB_PLATFORM_ERROR; + } + // passed all checks + return HS_SUCCESS; +} + +// Decode and check the database header, returning appropriate errors or +// HS_SUCCESS if it's OK. The header should be allocated on the stack +// and later copied into the deserialized database. 
+static +hs_error_t db_decode_header(const char **bytes, const size_t length, + struct hs_database *header) { + if (!*bytes) { + return HS_INVALID; + } + + if (length < sizeof(struct hs_database)) { + return HS_INVALID; + } + + // There's no requirement, really, that the serialized stream of bytes + // we've been given is 4-byte aligned, so we use unaligned loads here. + + const u32 *buf = (const u32 *)*bytes; + + // Zero header so that none of it (e.g. its padding) is uninitialized. + memset(header, 0, sizeof(struct hs_database)); + + header->magic = unaligned_load_u32(buf++); + if (header->magic != HS_DB_MAGIC) { + return HS_INVALID; + } + + header->version = unaligned_load_u32(buf++); + if (header->version != HS_DB_VERSION) { + return HS_DB_VERSION_ERROR; + } + + header->length = unaligned_load_u32(buf++); + if (length != sizeof(struct hs_database) + header->length) { + DEBUG_PRINTF("bad length %zu, expecting %zu\n", length, + sizeof(struct hs_database) + header->length); + return HS_INVALID; + } + + header->platform = unaligned_load_u64a(buf); + buf += 2; + header->crc32 = unaligned_load_u32(buf++); + header->reserved0 = unaligned_load_u32(buf++); + header->reserved1 = unaligned_load_u32(buf++); + + *bytes = (const char *)buf; + + return HS_SUCCESS; // Header checks out +} + +// Check the CRC on a database +static +hs_error_t db_check_crc(const hs_database_t *db) { + const char *bytecode = hs_get_bytecode(db); + u32 crc = Crc32c_ComputeBuf(0, bytecode, db->length); + if (crc != db->crc32) { + DEBUG_PRINTF("crc mismatch! 0x%x != 0x%x\n", crc, db->crc32); + return HS_INVALID; + } + return HS_SUCCESS; +} + +static +void db_copy_bytecode(const char *serialized, hs_database_t *db) { + // we need to align things manually + uintptr_t shift = (uintptr_t)db->bytes & 0x3f; + db->bytecode = offsetof(struct hs_database, bytes) - shift; + char *bytecode = (char *)db + db->bytecode; + + // Copy the bytecode into place + memcpy(bytecode, serialized, db->length); +} + +HS_PUBLIC_API +hs_error_t HS_CDECL hs_deserialize_database_at(const char *bytes, + const size_t length, + hs_database_t *db) { + if (!bytes || !db) { + return HS_INVALID; + } + + // We require the user to deserialize into an 8-byte aligned region. 
+ if (!ISALIGNED_N(db, 8)) { + return HS_BAD_ALIGN; + } + + // Decode the header + hs_database_t header; + hs_error_t ret = db_decode_header(&bytes, length, &header); + if (ret != HS_SUCCESS) { + return ret; + } + + // Make sure the serialized database is for our platform + ret = db_check_platform(header.platform); + if (ret != HS_SUCCESS) { + return ret; + } + + // Zero new space for safety + size_t dblength = sizeof(struct hs_database) + header.length; + memset(db, 0, dblength); + + // Copy the decoded header into place + memcpy(db, &header, sizeof(header)); + + // Copy the bytecode into the correctly-aligned location, set offsets + db_copy_bytecode(bytes, db); + + if (db_check_crc(db) != HS_SUCCESS) { + return HS_INVALID; + } + + return HS_SUCCESS; +} + +HS_PUBLIC_API +hs_error_t HS_CDECL hs_deserialize_database(const char *bytes, + const size_t length, + hs_database_t **db) { + if (!bytes || !db) { + return HS_INVALID; + } + + *db = NULL; + + // Decode and check the header + hs_database_t header; + hs_error_t ret = db_decode_header(&bytes, length, &header); + if (ret != HS_SUCCESS) { + return ret; + } + + // Make sure the serialized database is for our platform + ret = db_check_platform(header.platform); + if (ret != HS_SUCCESS) { + return ret; + } + + // Allocate space for new database + size_t dblength = sizeof(struct hs_database) + header.length; + struct hs_database *tempdb = hs_database_alloc(dblength); + ret = hs_check_alloc(tempdb); + if (ret != HS_SUCCESS) { + hs_database_free(tempdb); + return ret; + } + + // Zero new space for safety + memset(tempdb, 0, dblength); + + // Copy the decoded header into place + memcpy(tempdb, &header, sizeof(header)); + + // Copy the bytecode into the correctly-aligned location, set offsets + db_copy_bytecode(bytes, tempdb); + + if (db_check_crc(tempdb) != HS_SUCCESS) { + hs_database_free(tempdb); + return HS_INVALID; + } + + *db = tempdb; + return HS_SUCCESS; +} + +HS_PUBLIC_API +hs_error_t HS_CDECL hs_database_size(const hs_database_t *db, size_t *size) { + if (!size) { + return HS_INVALID; + } + + hs_error_t ret = validDatabase(db); + if (unlikely(ret != HS_SUCCESS)) { + return ret; + } + + *size = sizeof(struct hs_database) + db->length; + return HS_SUCCESS; +} + +HS_PUBLIC_API +hs_error_t HS_CDECL hs_serialized_database_size(const char *bytes, + const size_t length, + size_t *size) { + // Decode and check the header + hs_database_t header; + hs_error_t ret = db_decode_header(&bytes, length, &header); + if (ret != HS_SUCCESS) { + return ret; + } + + if (!size) { + return HS_INVALID; + } + + *size = sizeof(struct hs_database) + header.length; + return HS_SUCCESS; +} + +hs_error_t dbIsValid(const hs_database_t *db) { + if (db->magic != HS_DB_MAGIC) { + DEBUG_PRINTF("bad magic\n"); + return HS_INVALID; + } + + if (db->version != HS_DB_VERSION) { + DEBUG_PRINTF("bad version\n"); + return HS_DB_VERSION_ERROR; + } + + if (db_check_platform(db->platform) != HS_SUCCESS) { + DEBUG_PRINTF("bad platform\n"); + return HS_DB_PLATFORM_ERROR; + } + + if (!ISALIGNED_16(hs_get_bytecode(db))) { + DEBUG_PRINTF("bad alignment\n"); + return HS_INVALID; + } + + hs_error_t rv = db_check_crc(db); + if (rv != HS_SUCCESS) { + DEBUG_PRINTF("bad crc\n"); + return rv; + } + + return HS_SUCCESS; +} + +#if defined(_WIN32) +#define SNPRINTF_COMPAT _snprintf +#else +#define SNPRINTF_COMPAT snprintf +#endif + +/** Allocate a buffer and prints the database info into it. Returns an + * appropriate error code on failure, or HS_SUCCESS on success. 
*/ +static +hs_error_t print_database_string(char **s, u32 version, const platform_t plat, + u32 raw_mode) { + assert(s); + *s = NULL; + + u8 release = (version >> 8) & 0xff; + u8 minor = (version >> 16) & 0xff; + u8 major = (version >> 24) & 0xff; + + const char *features = (plat & HS_PLATFORM_NOAVX512VBMI) + ? (plat & HS_PLATFORM_NOAVX512) + ? (plat & HS_PLATFORM_NOAVX2) ? "" : "AVX2" + : "AVX512" + : "AVX512VBMI"; + + const char *mode = NULL; + + if (raw_mode == HS_MODE_STREAM) { + mode = "STREAM"; + } else if (raw_mode == HS_MODE_VECTORED) { + mode = "VECTORED"; + } else { + assert(raw_mode == HS_MODE_BLOCK); + mode = "BLOCK"; + } + + // Initial allocation size, which should be large enough to print our info. + // If it isn't, snprintf will tell us and we can resize appropriately. + size_t len = 256; + + while (1) { + char *buf = hs_misc_alloc(len); + hs_error_t ret = hs_check_alloc(buf); + if (ret != HS_SUCCESS) { + hs_misc_free(buf); + return ret; + } + + // Note: SNPRINTF_COMPAT is a macro defined above, to cope with systems + // that don't have snprintf but have a workalike. + int p_len = SNPRINTF_COMPAT( + buf, len, "Version: %u.%u.%u Features: %s Mode: %s", + major, minor, release, features, mode); + if (p_len < 0) { + DEBUG_PRINTF("snprintf output error, returned %d\n", p_len); + hs_misc_free(buf); + break; + } else if ((size_t)p_len < len) { // output fit within buffer. + assert(buf[p_len] == '\0'); + *s = buf; + return HS_SUCCESS; + } else { // output didn't fit: resize and reallocate. + len = (size_t)p_len + 1; // must add one for null terminator. + hs_misc_free(buf); + } + } + + return HS_NOMEM; +} + +HS_PUBLIC_API +hs_error_t HS_CDECL hs_serialized_database_info(const char *bytes, + size_t length, char **info) { + if (!info) { + return HS_INVALID; + } + *info = NULL; + + // Decode and check the header + hs_database_t header; + hs_error_t ret = db_decode_header(&bytes, length, &header); + if (ret != HS_SUCCESS) { + return ret; + } + + u32 mode = unaligned_load_u32(bytes + offsetof(struct RoseEngine, mode)); + + return print_database_string(info, header.version, header.platform, mode); +} + +HS_PUBLIC_API +hs_error_t HS_CDECL hs_database_info(const hs_database_t *db, char **info) { + if (!info) { + return HS_INVALID; + } + *info = NULL; + + if (!db || !db_correctly_aligned(db) || db->magic != HS_DB_MAGIC) { + return HS_INVALID; + } + + platform_t plat; + plat = db->platform; + + const struct RoseEngine *rose = hs_get_bytecode(db); + + return print_database_string(info, db->version, plat, rose->mode); +} diff --git a/regex/database.h b/regex/database.h new file mode 100644 index 000000000..f122f97be --- /dev/null +++ b/regex/database.h @@ -0,0 +1,142 @@ +/* + * Copyright (c) 2015-2020, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Runtime code for hs_database manipulation. + */ + +#ifndef DATABASE_H_D467FD6F343DDE +#define DATABASE_H_D467FD6F343DDE + +#ifdef __cplusplus +extern "C" +{ +#endif + +#include "hs_compile.h" // for HS_MODE_ flags +#include "hs_version.h" +#include "ue2common.h" +#include "util/arch.h" + +#define HS_DB_VERSION HS_VERSION_32BIT +#define HS_DB_MAGIC (0xdbdbdbdbU) + +// Values in here cannot (easily) change - add new ones! + +// CPU type is the low 6 bits (we can't need more than 64, surely!) + +#define HS_PLATFORM_INTEL 1 +#define HS_PLATFORM_CPU_MASK 0x3F + +#define HS_PLATFORM_NOAVX2 (4<<13) +#define HS_PLATFORM_NOAVX512 (8<<13) +#define HS_PLATFORM_NOAVX512VBMI (0x10<<13) + +/** \brief Platform features bitmask. */ +typedef u64a platform_t; + +static UNUSED +const platform_t hs_current_platform = { +#if !defined(HAVE_AVX2) + HS_PLATFORM_NOAVX2 | +#endif +#if !defined(HAVE_AVX512) + HS_PLATFORM_NOAVX512 | +#endif +#if !defined(HAVE_AVX512VBMI) + HS_PLATFORM_NOAVX512VBMI | +#endif + 0, +}; + +static UNUSED +const platform_t hs_current_platform_no_avx2 = { + HS_PLATFORM_NOAVX2 | + HS_PLATFORM_NOAVX512 | + HS_PLATFORM_NOAVX512VBMI | + 0, +}; + +static UNUSED +const platform_t hs_current_platform_no_avx512 = { + HS_PLATFORM_NOAVX512 | + HS_PLATFORM_NOAVX512VBMI | + 0, +}; + +static UNUSED +const platform_t hs_current_platform_no_avx512vbmi = { + HS_PLATFORM_NOAVX512VBMI | + 0, +}; + +/* + * a header to enclose the actual bytecode - useful for keeping info about the + * compiled data. + */ +struct hs_database { + u32 magic; + u32 version; + u32 length; + u64a platform; + u32 crc32; + u32 reserved0; + u32 reserved1; + u32 bytecode; // offset relative to db start + u32 padding[16]; + char bytes[]; +}; + +static really_inline +const void *hs_get_bytecode(const struct hs_database *db) { + return ((const char *)db + db->bytecode); +} + +/** + * Cheap database sanity checks used in block mode scan calls and streaming + * mode open calls. 
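+ *
+ * Only the magic number and version are checked here; the more expensive
+ * checks (platform match, bytecode alignment, CRC) live in dbIsValid().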
+ */ +static really_inline +hs_error_t validDatabase(const hs_database_t *db) { + if (!db || db->magic != HS_DB_MAGIC) { + return HS_INVALID; + } + if (db->version != HS_DB_VERSION) { + return HS_DB_VERSION_ERROR; + } + + return HS_SUCCESS; +} + +hs_error_t dbIsValid(const struct hs_database *db); + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif /* DATABASE_H_D467FD6F343DDE */ diff --git a/regex/dkms.conf b/regex/dkms.conf new file mode 100644 index 000000000..ea75d2553 --- /dev/null +++ b/regex/dkms.conf @@ -0,0 +1,8 @@ +PACKAGE_NAME="linux-rex" +PACKAGE_VERSION="0.1" +BUILD_EXCLUSIVE_ARCH=x86_64 +MAKE[0]="./build.sh ${kernel_source_dir} M=${dkms_tree}/${PACKAGE_NAME}/${PACKAGE_VERSION}/build -j${parallel_jobs}" +CLEAN="make -C ${kernel_source_dir} M=${dkms_tree}/${PACKAGE_NAME}/${PACKAGE_VERSION}/build clean" +AUTOINSTALL=yes +BUILT_MODULE_NAME[0]="xdp_rex" +DEST_MODULE_LOCATION[0]=/extra diff --git a/regex/fdr/fdr.c b/regex/fdr/fdr.c new file mode 100644 index 000000000..d33756d35 --- /dev/null +++ b/regex/fdr/fdr.c @@ -0,0 +1,881 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "fdr.h" +#include "fdr_confirm.h" +#include "fdr_confirm_runtime.h" +#include "fdr_internal.h" +#include "fdr_loadval.h" +#include "flood_runtime.h" +#include "scratch.h" +#include "teddy.h" +#include "teddy_internal.h" +#include "util/arch.h" +#include "util/simd_utils.h" +#include "util/uniform_ops.h" + +/** \brief number of bytes processed in each iteration */ +#define ITER_BYTES 16 + +/** \brief total zone buffer size */ +#define ZONE_TOTAL_SIZE 64 + +/** \brief maximum number of allowed zones */ +#define ZONE_MAX 3 + +/** \brief zone information. + * + * Zone represents a region of data to scan in FDR. + * + * The incoming buffer is to split in multiple zones to ensure two properties: + * 1: that we can read 8? 
bytes behind to generate a hash safely + * 2: that we can read the 3 byte after the current byte (domain > 8) + */ +struct zone { + /** \brief copied buffer, used only when it is a boundary zone. */ + u8 ALIGN_CL_DIRECTIVE buf[ZONE_TOTAL_SIZE]; + + /** \brief shift amount for fdr state to avoid unwanted match. */ + u8 shift; + + /** \brief if boundary zone, start points into the zone buffer after the + * pre-padding. Otherwise, points to the main buffer, appropriately. */ + const u8 *start; + + /** \brief if boundary zone, end points to the end of zone. Otherwise, + * pointer to the main buffer, appropriately. */ + const u8 *end; + + /** \brief the amount to adjust to go from a pointer in the zones region + * (between start and end) to a pointer in the original data buffer. */ + ptrdiff_t zone_pointer_adjust; + + /** \brief firstFloodDetect from FDR_Runtime_Args for non-boundary zones, + * otherwise end of the zone buf. floodPtr always points inside the same + * buffer as the start pointe. */ + const u8 *floodPtr; +}; + +static +const ALIGN_CL_DIRECTIVE u8 zone_or_mask[ITER_BYTES+1][ITER_BYTES] = { + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00 }, + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 } +}; + +/* compilers don't reliably synthesize the 32-bit ANDN instruction here, + * so we force its generation. + */ +static really_inline +u64a andn(const u32 a, const u8 *b) { + u64a r; +#if defined(HAVE_BMI) && !defined(NO_ASM) + __asm__ ("andn\t%2,%1,%k0" : "=r"(r) : "r"(a), "m"(*(const u32 *)b)); +#else + r = unaligned_load_u32(b) & ~a; +#endif + return r; +} + +/* generates an initial state mask based on the last byte-ish of history rather + * than being all accepting. If there is no history to consider, the state is + * generated based on the minimum length of each bucket in order to prevent + * confirms. 
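+ *
+ * With history available, the bytes just before the scan start are read
+ * via lv_u16(), masked with the domain mask and used to index the reach
+ * table (ft); the loaded state is then shifted right by one byte.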
+ */ +static really_inline +m128 getInitState(const struct FDR *fdr, u8 len_history, const u64a *ft, + const struct zone *z) { + m128 s; + if (len_history) { + /* +1: the zones ensure that we can read the byte at z->end */ + u32 tmp = lv_u16(z->start + z->shift - 1, z->buf, z->end + 1); + tmp &= fdr->domainMask; + s = load_m128_from_u64a(ft + tmp); + s = rshiftbyte_m128(s, 1); + } else { + s = fdr->start; + } + return s; +} + +static really_inline +void get_conf_stride_1(const u8 *itPtr, UNUSED const u8 *start_ptr, + UNUSED const u8 *end_ptr, u32 domain_mask_flipped, + const u64a *ft, u64a *conf0, u64a *conf8, m128 *s) { + /* +1: the zones ensure that we can read the byte at z->end */ + assert(itPtr >= start_ptr && itPtr + ITER_BYTES <= end_ptr); + u64a reach0 = andn(domain_mask_flipped, itPtr); + u64a reach1 = andn(domain_mask_flipped, itPtr + 1); + u64a reach2 = andn(domain_mask_flipped, itPtr + 2); + u64a reach3 = andn(domain_mask_flipped, itPtr + 3); + + m128 st0 = load_m128_from_u64a(ft + reach0); + m128 st1 = load_m128_from_u64a(ft + reach1); + m128 st2 = load_m128_from_u64a(ft + reach2); + m128 st3 = load_m128_from_u64a(ft + reach3); + + u64a reach4 = andn(domain_mask_flipped, itPtr + 4); + u64a reach5 = andn(domain_mask_flipped, itPtr + 5); + u64a reach6 = andn(domain_mask_flipped, itPtr + 6); + u64a reach7 = andn(domain_mask_flipped, itPtr + 7); + + m128 st4 = load_m128_from_u64a(ft + reach4); + m128 st5 = load_m128_from_u64a(ft + reach5); + m128 st6 = load_m128_from_u64a(ft + reach6); + m128 st7 = load_m128_from_u64a(ft + reach7); + + st1 = lshiftbyte_m128(st1, 1); + st2 = lshiftbyte_m128(st2, 2); + st3 = lshiftbyte_m128(st3, 3); + st4 = lshiftbyte_m128(st4, 4); + st5 = lshiftbyte_m128(st5, 5); + st6 = lshiftbyte_m128(st6, 6); + st7 = lshiftbyte_m128(st7, 7); + + st0 = or128(st0, st1); + st2 = or128(st2, st3); + st4 = or128(st4, st5); + st6 = or128(st6, st7); + st0 = or128(st0, st2); + st4 = or128(st4, st6); + st0 = or128(st0, st4); + *s = or128(*s, st0); + + *conf0 = movq(*s); + *s = rshiftbyte_m128(*s, 8); + *conf0 ^= ~0ULL; + + u64a reach8 = andn(domain_mask_flipped, itPtr + 8); + u64a reach9 = andn(domain_mask_flipped, itPtr + 9); + u64a reach10 = andn(domain_mask_flipped, itPtr + 10); + u64a reach11 = andn(domain_mask_flipped, itPtr + 11); + + m128 st8 = load_m128_from_u64a(ft + reach8); + m128 st9 = load_m128_from_u64a(ft + reach9); + m128 st10 = load_m128_from_u64a(ft + reach10); + m128 st11 = load_m128_from_u64a(ft + reach11); + + u64a reach12 = andn(domain_mask_flipped, itPtr + 12); + u64a reach13 = andn(domain_mask_flipped, itPtr + 13); + u64a reach14 = andn(domain_mask_flipped, itPtr + 14); + u64a reach15 = andn(domain_mask_flipped, itPtr + 15); + + m128 st12 = load_m128_from_u64a(ft + reach12); + m128 st13 = load_m128_from_u64a(ft + reach13); + m128 st14 = load_m128_from_u64a(ft + reach14); + m128 st15 = load_m128_from_u64a(ft + reach15); + + st9 = lshiftbyte_m128(st9, 1); + st10 = lshiftbyte_m128(st10, 2); + st11 = lshiftbyte_m128(st11, 3); + st12 = lshiftbyte_m128(st12, 4); + st13 = lshiftbyte_m128(st13, 5); + st14 = lshiftbyte_m128(st14, 6); + st15 = lshiftbyte_m128(st15, 7); + + st8 = or128(st8, st9); + st10 = or128(st10, st11); + st12 = or128(st12, st13); + st14 = or128(st14, st15); + st8 = or128(st8, st10); + st12 = or128(st12, st14); + st8 = or128(st8, st12); + *s = or128(*s, st8); + + *conf8 = movq(*s); + *s = rshiftbyte_m128(*s, 8); + *conf8 ^= ~0ULL; +} + +static really_inline +void get_conf_stride_2(const u8 *itPtr, UNUSED const u8 *start_ptr, + UNUSED 
const u8 *end_ptr, u32 domain_mask_flipped, + const u64a *ft, u64a *conf0, u64a *conf8, m128 *s) { + assert(itPtr >= start_ptr && itPtr + ITER_BYTES <= end_ptr); + u64a reach0 = andn(domain_mask_flipped, itPtr); + u64a reach2 = andn(domain_mask_flipped, itPtr + 2); + u64a reach4 = andn(domain_mask_flipped, itPtr + 4); + u64a reach6 = andn(domain_mask_flipped, itPtr + 6); + + m128 st0 = load_m128_from_u64a(ft + reach0); + m128 st2 = load_m128_from_u64a(ft + reach2); + m128 st4 = load_m128_from_u64a(ft + reach4); + m128 st6 = load_m128_from_u64a(ft + reach6); + + u64a reach8 = andn(domain_mask_flipped, itPtr + 8); + u64a reach10 = andn(domain_mask_flipped, itPtr + 10); + u64a reach12 = andn(domain_mask_flipped, itPtr + 12); + u64a reach14 = andn(domain_mask_flipped, itPtr + 14); + + m128 st8 = load_m128_from_u64a(ft + reach8); + m128 st10 = load_m128_from_u64a(ft + reach10); + m128 st12 = load_m128_from_u64a(ft + reach12); + m128 st14 = load_m128_from_u64a(ft + reach14); + + st2 = lshiftbyte_m128(st2, 2); + st4 = lshiftbyte_m128(st4, 4); + st6 = lshiftbyte_m128(st6, 6); + + *s = or128(*s, st0); + *s = or128(*s, st2); + *s = or128(*s, st4); + *s = or128(*s, st6); + + *conf0 = movq(*s); + *s = rshiftbyte_m128(*s, 8); + *conf0 ^= ~0ULL; + + st10 = lshiftbyte_m128(st10, 2); + st12 = lshiftbyte_m128(st12, 4); + st14 = lshiftbyte_m128(st14, 6); + + *s = or128(*s, st8); + *s = or128(*s, st10); + *s = or128(*s, st12); + *s = or128(*s, st14); + + *conf8 = movq(*s); + *s = rshiftbyte_m128(*s, 8); + *conf8 ^= ~0ULL; +} + +static really_inline +void get_conf_stride_4(const u8 *itPtr, UNUSED const u8 *start_ptr, + UNUSED const u8 *end_ptr, u32 domain_mask_flipped, + const u64a *ft, u64a *conf0, u64a *conf8, m128 *s) { + assert(itPtr >= start_ptr && itPtr + ITER_BYTES <= end_ptr); + u64a reach0 = andn(domain_mask_flipped, itPtr); + u64a reach4 = andn(domain_mask_flipped, itPtr + 4); + u64a reach8 = andn(domain_mask_flipped, itPtr + 8); + u64a reach12 = andn(domain_mask_flipped, itPtr + 12); + + m128 st0 = load_m128_from_u64a(ft + reach0); + m128 st4 = load_m128_from_u64a(ft + reach4); + m128 st8 = load_m128_from_u64a(ft + reach8); + m128 st12 = load_m128_from_u64a(ft + reach12); + + st4 = lshiftbyte_m128(st4, 4); + st12 = lshiftbyte_m128(st12, 4); + + *s = or128(*s, st0); + *s = or128(*s, st4); + *conf0 = movq(*s); + *s = rshiftbyte_m128(*s, 8); + *conf0 ^= ~0ULL; + + *s = or128(*s, st8); + *s = or128(*s, st12); + *conf8 = movq(*s); + *s = rshiftbyte_m128(*s, 8); + *conf8 ^= ~0ULL; +} + +static really_inline +void do_confirm_fdr(u64a *conf, u8 offset, hwlmcb_rv_t *control, + const u32 *confBase, const struct FDR_Runtime_Args *a, + const u8 *ptr, u32 *last_match_id, struct zone *z) { + const u8 bucket = 8; + + if (likely(!*conf)) { + return; + } + + /* ptr is currently referring to a location in the zone's buffer, we also + * need a pointer in the original, main buffer for the final string compare. 
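+ *
+ * Adding zone_pointer_adjust (set when the zone was built) converts a
+ * zone-buffer pointer back to the corresponding position in a->buf, so the
+ * offsets handed to confWithBit() below are relative to the caller's buffer.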
+ */ + const u8 *ptr_main = (const u8 *)((uintptr_t)ptr + z->zone_pointer_adjust); + + const u8 *confLoc = ptr; + + do { + u32 bit = findAndClearLSB_64(conf); + u32 byte = bit / bucket + offset; + u32 bitRem = bit % bucket; + u32 idx = bitRem; + u32 cf = confBase[idx]; + if (!cf) { + continue; + } + const struct FDRConfirm *fdrc = (const struct FDRConfirm *) + ((const u8 *)confBase + cf); + if (!(fdrc->groups & *control)) { + continue; + } + u64a confVal = unaligned_load_u64a(confLoc + byte - sizeof(u64a) + 1); + confWithBit(fdrc, a, ptr_main - a->buf + byte, control, + last_match_id, confVal, conf, bit); + } while (unlikely(!!*conf)); +} + +static really_inline +void dumpZoneInfo(UNUSED struct zone *z, UNUSED size_t zone_id) { +#ifdef DEBUG + DEBUG_PRINTF("zone: zone=%zu, bufPtr=%p\n", zone_id, z->buf); + DEBUG_PRINTF("zone: startPtr=%p, endPtr=%p, shift=%u\n", + z->start, z->end, z->shift); + DEBUG_PRINTF("zone: zone_pointer_adjust=%zd, floodPtr=%p\n", + z->zone_pointer_adjust, z->floodPtr); + DEBUG_PRINTF("zone buf:"); + for (size_t i = 0; i < ZONE_TOTAL_SIZE; i++) { + if (i % 8 == 0) { + printf("_"); + } + if (z->buf[i]) { + printf("%02x", z->buf[i]); + } else { + printf(".."); + } + } + printf("\n"); +#endif +}; + +/** + * \brief Updates attributes for non-boundary region zone. + */ +static really_inline +void createMainZone(const u8 *flood, const u8 *begin, const u8 *end, + struct zone *z) { + z->zone_pointer_adjust = 0; /* zone buffer is the main buffer */ + z->start = begin; + z->end = end; + z->floodPtr = flood; + z->shift = 0; +} + +/** + * \brief Create zone for short cases (<= ITER_BYTES). + * + * For this case we need to copy everything into the zone's internal buffer. + * + * We need to ensure that we run over real data if it exists (in history or + * before zone begin). We also need to ensure 8 bytes before any data being + * matched can be read (to perform a conf hash). + * + * We also need to ensure that the data at z->end can be read. + * + * Hence, the zone consists of: + * 16 bytes of history, + * 1 - 24 bytes of data form the buffer (ending at end), + * 1 byte of final padding + */ +static really_inline +void createShortZone(const u8 *buf, const u8 *hend, const u8 *begin, + const u8 *end, struct zone *z) { + /* the floodPtr for BOUNDARY zones are maximum of end of zone buf to avoid + * the checks in boundary zone. */ + z->floodPtr = z->buf + ZONE_TOTAL_SIZE; + + ptrdiff_t z_len = end - begin; + assert(z_len > 0); + assert(z_len <= ITER_BYTES); + + z->shift = ITER_BYTES - z_len; /* ignore bytes outside region specified */ + + static const size_t ZONE_SHORT_DATA_OFFSET = 16; /* after history */ + + /* we are guaranteed to always have 16 initialised bytes at the end of + * the history buffer (they may be garbage coming from the stream state + * preceding hbuf, but bytes that don't correspond to actual history + * shouldn't affect computations). */ + *(m128 *)z->buf = loadu128(hend - sizeof(m128)); + + /* The amount of data we have to copy from main buffer. 
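+ * Capped at ITER_BYTES + sizeof(CONF_TYPE) (24 bytes); the switch below
+ * handles every size with at most two, possibly overlapping, unaligned
+ * loads and stores rather than a byte-by-byte copy.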
*/ + size_t copy_len = MIN((size_t)(end - buf), + ITER_BYTES + sizeof(CONF_TYPE)); + + u8 *zone_data = z->buf + ZONE_SHORT_DATA_OFFSET; + switch (copy_len) { + case 1: + *zone_data = *(end - 1); + break; + case 2: + *(u16 *)zone_data = unaligned_load_u16(end - 2); + break; + case 3: + *(u16 *)zone_data = unaligned_load_u16(end - 3); + *(zone_data + 2) = *(end - 1); + break; + case 4: + *(u32 *)zone_data = unaligned_load_u32(end - 4); + break; + case 5: + case 6: + case 7: + /* perform copy with 2 overlapping 4-byte chunks from buf. */ + *(u32 *)zone_data = unaligned_load_u32(end - copy_len); + unaligned_store_u32(zone_data + copy_len - sizeof(u32), + unaligned_load_u32(end - sizeof(u32))); + break; + case 8: + *(u64a *)zone_data = unaligned_load_u64a(end - 8); + break; + case 9: + case 10: + case 11: + case 12: + case 13: + case 14: + case 15: + /* perform copy with 2 overlapping 8-byte chunks from buf. */ + *(u64a *)zone_data = unaligned_load_u64a(end - copy_len); + unaligned_store_u64a(zone_data + copy_len - sizeof(u64a), + unaligned_load_u64a(end - sizeof(u64a))); + break; + case 16: + /* copy 16-bytes from buf. */ + *(m128 *)zone_data = loadu128(end - 16); + break; + default: + assert(copy_len <= sizeof(m128) + sizeof(u64a)); + + /* perform copy with (potentially overlapping) 8-byte and 16-byte chunks. + */ + *(u64a *)zone_data = unaligned_load_u64a(end - copy_len); + storeu128(zone_data + copy_len - sizeof(m128), + loadu128(end - sizeof(m128))); + break; + } + + /* set the start and end location of the zone buf + * to be scanned */ + u8 *z_end = z->buf + ZONE_SHORT_DATA_OFFSET + copy_len; + assert(ZONE_SHORT_DATA_OFFSET + copy_len >= ITER_BYTES); + + /* copy the post-padding byte; this is required for domain > 8 due to + * overhang */ + assert(ZONE_SHORT_DATA_OFFSET + copy_len + 3 < 64); + *z_end = 0; + + z->end = z_end; + z->start = z_end - ITER_BYTES; + z->zone_pointer_adjust = (ptrdiff_t)((uintptr_t)end - (uintptr_t)z_end); + assert(z->start + z->shift == z_end - z_len); +} + +/** + * \brief Create a zone for the start region. + * + * This function requires that there is > ITER_BYTES of data in the buffer to + * scan. The start zone itself is always responsible for scanning exactly + * ITER_BYTES of data - there are no warmup/junk bytes scanned. + * + * This zone ensures that the byte at z->end can be read and corresponds to + * the next byte of data. + * + * 8 bytes of history data are provided before z->start to allow proper hash + * generation in streaming mode. If buf != begin, upto 8 bytes of data + * prior to begin is also provided. + * + * Although we are not interested in bare literals which start before begin + * if buf != begin, lookarounds associated with the literal may require + * the data prior to begin for hash purposes. + */ +static really_inline +void createStartZone(const u8 *buf, const u8 *hend, const u8 *begin, + struct zone *z) { + assert(ITER_BYTES == sizeof(m128)); + assert(sizeof(CONF_TYPE) == 8); + static const size_t ZONE_START_BEGIN = sizeof(CONF_TYPE); + + const u8 *end = begin + ITER_BYTES; + + /* set floodPtr to the end of zone buf to avoid checks in start zone */ + z->floodPtr = z->buf + ZONE_TOTAL_SIZE; + + z->shift = 0; /* we are processing ITER_BYTES of real data */ + + /* we are guaranteed to always have 16 initialised bytes at the end of the + * history buffer (they may be garbage coming from the stream state + * preceding hbuf, but bytes that don't correspond to actual history + * shouldn't affect computations). 
However, for start zones, history is only + * required for conf hash purposes so we only need 8 bytes */ + unaligned_store_u64a(z->buf, unaligned_load_u64a(hend - sizeof(u64a))); + + /* The amount of data we have to copy from main buffer. */ + size_t copy_len = MIN((size_t)(end - buf), + ITER_BYTES + sizeof(CONF_TYPE)); + assert(copy_len >= 16); + + /* copy the post-padding byte; this is required for domain > 8 due to + * overhang. The start requires that there is data after the zone so it + * it safe to dereference end */ + z->buf[ZONE_START_BEGIN + copy_len] = *end; + + /* set the start and end location of the zone buf to be scanned */ + u8 *z_end = z->buf + ZONE_START_BEGIN + copy_len; + z->end = z_end; + z->start = z_end - ITER_BYTES; + + /* copy the first 8 bytes of the valid region */ + unaligned_store_u64a(z->buf + ZONE_START_BEGIN, + unaligned_load_u64a(end - copy_len)); + + /* copy the last 16 bytes, may overlap with the previous 8 byte write */ + storeu128(z_end - sizeof(m128), loadu128(end - sizeof(m128))); + + z->zone_pointer_adjust = (ptrdiff_t)((uintptr_t)end - (uintptr_t)z_end); + + assert(ZONE_START_BEGIN + copy_len + 3 < 64); +} + +/** + * \brief Create a zone for the end region. + * + * This function requires that there is > ITER_BYTES of data in the buffer to + * scan. The end zone is responsible for a scanning the <= ITER_BYTES rump of + * data and optional ITER_BYTES. The main zone cannot handle the last 3 bytes + * of the buffer. The end zone is required to handle an optional full + * ITER_BYTES from main zone when there are less than 3 bytes to scan. The + * main zone size is reduced by ITER_BYTES in this case. + * + * This zone ensures that the byte at z->end can be read by filling it with a + * padding character. + * + * Upto 8 bytes of data prior to begin is also provided for the purposes of + * generating hashes. History is not copied, as all locations which require + * history for generating a hash are the responsiblity of the start zone. + */ +static really_inline +void createEndZone(const u8 *buf, const u8 *begin, const u8 *end, + struct zone *z) { + /* the floodPtr for BOUNDARY zones are maximum of end of zone buf to avoid + * the checks in boundary zone. */ + z->floodPtr = z->buf + ZONE_TOTAL_SIZE; + + ptrdiff_t z_len = end - begin; + assert(z_len > 0); + size_t iter_bytes_second = 0; + size_t z_len_first = z_len; + if (z_len > ITER_BYTES) { + z_len_first = z_len - ITER_BYTES; + iter_bytes_second = ITER_BYTES; + } + z->shift = ITER_BYTES - z_len_first; + + const u8 *end_first = end - iter_bytes_second; + /* The amount of data we have to copy from main buffer for the + * first iteration. 
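+ * As in the short zone this is capped at ITER_BYTES + sizeof(CONF_TYPE);
+ * together with the optional second ITER_BYTES block it must still fit the
+ * 64-byte zone buffer (see the assert on total_copy_len below).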
*/ + size_t copy_len_first = MIN((size_t)(end_first - buf), + ITER_BYTES + sizeof(CONF_TYPE)); + assert(copy_len_first >= 16); + + size_t total_copy_len = copy_len_first + iter_bytes_second; + assert(total_copy_len + 3 < 64); + + /* copy the post-padding byte; this is required for domain > 8 due to + * overhang */ + z->buf[total_copy_len] = 0; + + /* set the start and end location of the zone buf + * to be scanned */ + u8 *z_end = z->buf + total_copy_len; + z->end = z_end; + z->start = z_end - ITER_BYTES - iter_bytes_second; + assert(z->start + z->shift == z_end - z_len); + + u8 *z_end_first = z_end - iter_bytes_second; + /* copy the first 8 bytes of the valid region */ + unaligned_store_u64a(z->buf, + unaligned_load_u64a(end_first - copy_len_first)); + + /* copy the last 16 bytes, may overlap with the previous 8 byte write */ + storeu128(z_end_first - sizeof(m128), loadu128(end_first - sizeof(m128))); + if (iter_bytes_second) { + storeu128(z_end - sizeof(m128), loadu128(end - sizeof(m128))); + } + + z->zone_pointer_adjust = (ptrdiff_t)((uintptr_t)end - (uintptr_t)z_end); +} + +/** + * \brief Prepare zones. + * + * This function prepares zones with actual buffer and some padded bytes. + * The actual ITER_BYTES bytes in zone is preceded by main buf and/or + * history buf and succeeded by padded bytes possibly from main buf, + * if available. + */ +static really_inline +size_t prepareZones(const u8 *buf, size_t len, const u8 *hend, + size_t start, const u8 *flood, struct zone *zoneArr) { + const u8 *ptr = buf + start; + size_t remaining = len - start; + + if (remaining <= ITER_BYTES) { + /* enough bytes to make only one zone */ + createShortZone(buf, hend, ptr, buf + len, &zoneArr[0]); + return 1; + } + + /* enough bytes to make more than one zone */ + + size_t numZone = 0; + createStartZone(buf, hend, ptr, &zoneArr[numZone++]); + ptr += ITER_BYTES; + + assert(ptr < buf + len); + + /* find maximum buffer location that the main zone can scan + * - must be a multiple of ITER_BYTES, and + * - cannot contain the last 3 bytes (due to 3 bytes read behind the + end of buffer in FDR main loop) + */ + const u8 *main_end = buf + start + ROUNDDOWN_N(len - start - 3, ITER_BYTES); + + /* create a zone if multiple of ITER_BYTES are found */ + if (main_end > ptr) { + createMainZone(flood, ptr, main_end, &zoneArr[numZone++]); + ptr = main_end; + } + /* create a zone with rest of the data from the main buffer */ + createEndZone(buf, ptr, buf + len, &zoneArr[numZone++]); + return numZone; +} + +#define INVALID_MATCH_ID (~0U) + +#define FDR_MAIN_LOOP(zz, s, get_conf_fn) \ + do { \ + const u8 *tryFloodDetect = zz->floodPtr; \ + const u8 *start_ptr = zz->start; \ + const u8 *end_ptr = zz->end; \ + \ + for (const u8 *itPtr = start_ptr; itPtr + ITER_BYTES <= end_ptr; \ + itPtr += ITER_BYTES) { \ + if (unlikely(itPtr > tryFloodDetect)) { \ + tryFloodDetect = floodDetect(fdr, a, &itPtr, tryFloodDetect,\ + &floodBackoff, &control, \ + ITER_BYTES); \ + if (unlikely(control == HWLM_TERMINATE_MATCHING)) { \ + return HWLM_TERMINATED; \ + } \ + } \ + __builtin_prefetch(itPtr + ITER_BYTES); \ + u64a conf0; \ + u64a conf8; \ + get_conf_fn(itPtr, start_ptr, end_ptr, domain_mask_flipped, \ + ft, &conf0, &conf8, &s); \ + do_confirm_fdr(&conf0, 0, &control, confBase, a, itPtr, \ + &last_match_id, zz); \ + do_confirm_fdr(&conf8, 8, &control, confBase, a, itPtr, \ + &last_match_id, zz); \ + if (unlikely(control == HWLM_TERMINATE_MATCHING)) { \ + return HWLM_TERMINATED; \ + } \ + } /* end for loop */ \ + } while (0) \ + +static 
never_inline +hwlm_error_t fdr_engine_exec(const struct FDR *fdr, + const struct FDR_Runtime_Args *a, + hwlm_group_t control) { + assert(ISALIGNED_CL(fdr)); + + u32 floodBackoff = FLOOD_BACKOFF_START; + u32 last_match_id = INVALID_MATCH_ID; + u32 domain_mask_flipped = ~fdr->domainMask; + u8 stride = fdr->stride; + const u64a *ft = + (const u64a *)((const u8 *)fdr + ROUNDUP_CL(sizeof(struct FDR))); + assert(ISALIGNED_CL(ft)); + const u32 *confBase = (const u32 *)((const u8 *)fdr + fdr->confOffset); + assert(ISALIGNED_CL(confBase)); + struct zone zones[ZONE_MAX]; + assert(fdr->domain > 8 && fdr->domain < 16); + + size_t numZone = prepareZones(a->buf, a->len, + a->buf_history + a->len_history, + a->start_offset, a->firstFloodDetect, zones); + assert(numZone <= ZONE_MAX); + m128 state = getInitState(fdr, a->len_history, ft, &zones[0]); + + for (size_t curZone = 0; curZone < numZone; curZone++) { + struct zone *z = &zones[curZone]; + dumpZoneInfo(z, curZone); + + /* When a zone contains less data than is processed in an iteration + * of FDR_MAIN_LOOP(), we need to scan over some extra data. + * + * We have chosen to scan this extra data at the start of the + * iteration. The extra data is either data we have already scanned or + * garbage (if it is earlier than offset 0), + * + * As a result we need to shift the incoming state back so that it will + * properly line up with the data being scanned. + * + * We also need to forbid reporting any matches in the data being + * rescanned as they have already been reported (or are over garbage but + * later stages should also provide that safety guarantee). + */ + + u8 shift = z->shift; + + state = variable_byte_shift_m128(state, shift); + + state = or128(state, load128(zone_or_mask[shift])); + + switch (stride) { + case 1: + FDR_MAIN_LOOP(z, state, get_conf_stride_1); + break; + case 2: + FDR_MAIN_LOOP(z, state, get_conf_stride_2); + break; + case 4: + FDR_MAIN_LOOP(z, state, get_conf_stride_4); + break; + default: + break; + } + } + + return HWLM_SUCCESS; +} + +#if defined(HAVE_AVX2) +#define ONLY_AVX2(func) func +#else +#define ONLY_AVX2(func) NULL +#endif + +typedef hwlm_error_t (*FDRFUNCTYPE)(const struct FDR *fdr, + const struct FDR_Runtime_Args *a, + hwlm_group_t control); + +static const FDRFUNCTYPE funcs[] = { + fdr_engine_exec, + NULL, /* old: fast teddy */ + NULL, /* old: fast teddy */ + ONLY_AVX2(fdr_exec_fat_teddy_msks1), + ONLY_AVX2(fdr_exec_fat_teddy_msks1_pck), + ONLY_AVX2(fdr_exec_fat_teddy_msks2), + ONLY_AVX2(fdr_exec_fat_teddy_msks2_pck), + ONLY_AVX2(fdr_exec_fat_teddy_msks3), + ONLY_AVX2(fdr_exec_fat_teddy_msks3_pck), + ONLY_AVX2(fdr_exec_fat_teddy_msks4), + ONLY_AVX2(fdr_exec_fat_teddy_msks4_pck), + fdr_exec_teddy_msks1, + fdr_exec_teddy_msks1_pck, + fdr_exec_teddy_msks2, + fdr_exec_teddy_msks2_pck, + fdr_exec_teddy_msks3, + fdr_exec_teddy_msks3_pck, + fdr_exec_teddy_msks4, + fdr_exec_teddy_msks4_pck, +}; + +#define FAKE_HISTORY_SIZE 16 +static const u8 fake_history[FAKE_HISTORY_SIZE]; + +hwlm_error_t fdrExec(const struct FDR *fdr, const u8 *buf, size_t len, + size_t start, HWLMCallback cb, + struct hs_scratch *scratch, hwlm_group_t groups) { + // We guarantee (for safezone construction) that it is safe to read 16 + // bytes before the end of the history buffer. 
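+    // Block mode has no real history, so a zeroed 16-byte fake_history array
+    // stands in for it; hbuf points one past its end, which keeps hbuf - 16
+    // readable for zone construction.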
+ const u8 *hbuf = fake_history + FAKE_HISTORY_SIZE; + + const struct FDR_Runtime_Args a = { + buf, + len, + hbuf, + 0, + start, + cb, + scratch, + nextFloodDetect(buf, len, FLOOD_BACKOFF_START), + 0 + }; + if (unlikely(a.start_offset >= a.len)) { + return HWLM_SUCCESS; + } else { + assert(funcs[fdr->engineID]); + return funcs[fdr->engineID](fdr, &a, groups); + } +} + +hwlm_error_t fdrExecStreaming(const struct FDR *fdr, const u8 *hbuf, + size_t hlen, const u8 *buf, size_t len, + size_t start, HWLMCallback cb, + struct hs_scratch *scratch, + hwlm_group_t groups) { + struct FDR_Runtime_Args a = { + buf, + len, + hbuf, + hlen, + start, + cb, + scratch, + nextFloodDetect(buf, len, FLOOD_BACKOFF_START), + /* we are guaranteed to always have 16 initialised bytes at the end of + * the history buffer (they may be garbage). */ + hbuf ? unaligned_load_u64a(hbuf + hlen - sizeof(u64a)) : (u64a)0 + }; + + hwlm_error_t ret; + if (unlikely(a.start_offset >= a.len)) { + ret = HWLM_SUCCESS; + } else { + assert(funcs[fdr->engineID]); + ret = funcs[fdr->engineID](fdr, &a, groups); + } + + return ret; +} diff --git a/regex/fdr/fdr.h b/regex/fdr/fdr.h new file mode 100644 index 000000000..4dcef851d --- /dev/null +++ b/regex/fdr/fdr.h @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief FDR literal matcher: runtime API. + */ + +#ifndef FDR_H +#define FDR_H + +#include "ue2common.h" +#include "hwlm/hwlm.h" + +// C linkage in the API +#ifdef __cplusplus +extern "C" { +#endif + +struct FDR; +struct hs_scratch; + +/** + * \brief Block-mode scan. + * + * \param fdr FDR matcher engine. + * \param buf Buffer to scan. + * \param len Length of buffer to scan. + * \param start First offset in buf at which a match may start. + * \param cb Callback to call when a match is found. + * \param scratch Scratch supplied to callback on match. + * \param groups Initial groups mask. 
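+ *
+ * \return HWLM_SUCCESS when the scan completes, or HWLM_TERMINATED if the
+ *         callback asked for matching to stop.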
+ */ +hwlm_error_t fdrExec(const struct FDR *fdr, const u8 *buf, size_t len, + size_t start, HWLMCallback cb, struct hs_scratch *scratch, + hwlm_group_t groups); + +/** + * \brief Streaming-mode scan. + * + * \param fdr FDR matcher engine. + * \param hbuf History buffer. + * \param hlen Length of history buffer (hbuf). + * \param buf Buffer to scan. + * \param len Length of buffer to scan (buf). + * \param start First offset in buf at which a match may start. + * \param cb Callback to call when a match is found. + * \param scratch Scratch supplied to callback on match. + * \param groups Initial groups mask. + */ +hwlm_error_t fdrExecStreaming(const struct FDR *fdr, const u8 *hbuf, + size_t hlen, const u8 *buf, size_t len, + size_t start, HWLMCallback cb, + struct hs_scratch *scratch, + hwlm_group_t groups); + +#ifdef __cplusplus +} +#endif // __cplusplus + +#endif // FDR_H diff --git a/regex/fdr/fdr_confirm.h b/regex/fdr/fdr_confirm.h new file mode 100644 index 000000000..a23082cc6 --- /dev/null +++ b/regex/fdr/fdr_confirm.h @@ -0,0 +1,94 @@ +/* + * Copyright (c) 2015-2019, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef FDR_CONFIRM_H +#define FDR_CONFIRM_H + +#include "ue2common.h" +#include "hwlm/hwlm.h" + +static really_inline +u32 mul_hash_64(u64a lv, u64a andmsk, u64a mult, u32 nBits) { + return ((lv & andmsk) * mult) >> (sizeof(u64a)*8 - nBits); +} + +// data structures +// TODO: fix this hard-coding +#define CONF_TYPE u64a +#define CONF_HASH_CALL mul_hash_64 + +/** + * \brief Flag indicating this literal doesn't need to be delivered more than + * once, used in LitInfo::flags. + */ +#define FDR_LIT_FLAG_NOREPEAT 1 + +/** + * \brief Structure describing a literal, linked to by FDRConfirm. + * + * This structure is followed in memory by a variable-sized string prefix, for + * strings that are longer than CONF_TYPE. + */ +struct LitInfo { + CONF_TYPE v; + CONF_TYPE msk; + hwlm_group_t groups; + u32 id; // literal ID as passed in + u8 size; + u8 flags; //!< bitfield of flags from FDR_LIT_FLAG_* above. 
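+    /** \brief Non-zero when another LitInfo follows this one in the confirm
+     * chain; consumed as a continue flag by the loop in confWithBit(). */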
+ u8 next; +}; + +#define FDRC_FLAG_NO_CONFIRM 1 +#define FDRC_FLAG_NOREPEAT 2 + +/** + * \brief FDR confirm header. + * + * This structure is followed in memory by: + * + * -# lit index mapping (array of u32) + * -# list of LitInfo structures + */ +struct FDRConfirm { + CONF_TYPE andmsk; + CONF_TYPE mult; + u32 nBits; + hwlm_group_t groups; +}; + +static really_inline +const u32 *getConfirmLitIndex(const struct FDRConfirm *fdrc) { + const u8 *base = (const u8 *)fdrc; + const u32 *litIndex = + (const u32 *)(base + ROUNDUP_N(sizeof(*fdrc), alignof(u32))); + assert(ISALIGNED(litIndex)); + return litIndex; +} + +#endif // FDR_CONFIRM_H diff --git a/regex/fdr/fdr_confirm_runtime.h b/regex/fdr/fdr_confirm_runtime.h new file mode 100644 index 000000000..5a2164952 --- /dev/null +++ b/regex/fdr/fdr_confirm_runtime.h @@ -0,0 +1,104 @@ +/* + * Copyright (c) 2015-2019, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef FDR_CONFIRM_RUNTIME_H +#define FDR_CONFIRM_RUNTIME_H + +#include "scratch.h" +#include "fdr_internal.h" +#include "fdr_loadval.h" +#include "hwlm/hwlm.h" +#include "ue2common.h" +#include "util/bitutils.h" +#include "util/compare.h" + +// this is ordinary confirmation function which runs through +// the whole confirmation procedure +static really_inline +void confWithBit(const struct FDRConfirm *fdrc, const struct FDR_Runtime_Args *a, + size_t i, hwlmcb_rv_t *control, u32 *last_match, + u64a conf_key, u64a *conf, u8 bit) { + assert(i < a->len); + assert(i >= a->start_offset); + assert(ISALIGNED(fdrc)); + + const u8 * buf = a->buf; + u32 c = CONF_HASH_CALL(conf_key, fdrc->andmsk, fdrc->mult, + fdrc->nBits); + u32 start = getConfirmLitIndex(fdrc)[c]; + if (likely(!start)) { + return; + } + + const struct LitInfo *li + = (const struct LitInfo *)((const u8 *)fdrc + start); + + struct hs_scratch *scratch = a->scratch; + assert(!scratch->fdr_conf); + scratch->fdr_conf = conf; + scratch->fdr_conf_offset = bit; + u8 oldNext; // initialized in loop + do { + assert(ISALIGNED(li)); + + if (unlikely((conf_key & li->msk) != li->v)) { + goto out; + } + + if ((*last_match == li->id) && (li->flags & FDR_LIT_FLAG_NOREPEAT)) { + goto out; + } + + const u8 *loc = buf + i - li->size + 1; + + if (loc < buf) { + u32 full_overhang = buf - loc; + size_t len_history = a->len_history; + + // can't do a vectored confirm either if we don't have + // the bytes + if (full_overhang > len_history) { + goto out; + } + } + assert(li->size <= sizeof(CONF_TYPE)); + + if (unlikely(!(li->groups & *control))) { + goto out; + } + + *last_match = li->id; + *control = a->cb(i, li->id, scratch); + out: + oldNext = li->next; // oldNext is either 0 or an 'adjust' value + li++; + } while (oldNext); + scratch->fdr_conf = NULL; +} + +#endif diff --git a/regex/fdr/fdr_internal.h b/regex/fdr/fdr_internal.h new file mode 100644 index 000000000..c79f61c1f --- /dev/null +++ b/regex/fdr/fdr_internal.h @@ -0,0 +1,105 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief FDR literal matcher: data structures. + */ + +#ifndef FDR_INTERNAL_H +#define FDR_INTERNAL_H + +#include "ue2common.h" +#include "hwlm/hwlm.h" // for hwlm_group_t, HWLMCallback + +struct hs_scratch; + +typedef enum { + NOT_CAUTIOUS, //!< not near a boundary (quantify?) + VECTORING //!< potentially vectoring +} CautionReason; + +/** \brief number of different ids that can be triggered by floods of any given + * character. */ +#define FDR_FLOOD_MAX_IDS 16 + +struct FDRFlood { + hwlm_group_t allGroups; //!< all the groups or'd together + u32 suffix; + + /** \brief 0 to FDR_FLOOD_MAX_IDS-1 ids that are generated once per char on + * a flood. + * If larger we won't handle this through the flood path at all. */ + u16 idCount; + + u32 ids[FDR_FLOOD_MAX_IDS]; //!< the ids + hwlm_group_t groups[FDR_FLOOD_MAX_IDS]; //!< group ids to go with string ids +}; + +/** \brief FDR structure. + * + * 1. struct as-is + * 2. primary matching table + * 3. confirm stuff + */ +struct FDR { + u32 engineID; + u32 size; + u32 maxStringLen; + u32 numStrings; + u32 confOffset; + u32 floodOffset; + u8 stride; /* stride - how frequently the data is consulted by the first + * stage matcher */ + u8 domain; /* number of bits used to index into main FDR table. This value + * is used only of debugging/asserts. */ + u16 domainMask; /* pre-computed domain mask */ + u32 tabSize; /* pre-computed hashtable size in bytes */ + m128 start; /* initial start state to use at offset 0. The state has been + * set up based on the min length of buckets to reduce the need + * for pointless confirms. */ +}; + +/** \brief FDR runtime arguments. + * + * This structure handles read-only things that are passed extensively around + * the FDR run-time functions. They are set by the API, passed by value into + * the main function, then a pointer is passed around to all the various + * sub-functions (confirm & flood). */ +struct FDR_Runtime_Args { + const u8 *buf; + size_t len; + const u8 *buf_history; + size_t len_history; + size_t start_offset; + HWLMCallback cb; + struct hs_scratch *scratch; + const u8 *firstFloodDetect; + const u64a histBytes; +}; + +#endif diff --git a/regex/fdr/fdr_loadval.h b/regex/fdr/fdr_loadval.h new file mode 100644 index 000000000..86c39c7f3 --- /dev/null +++ b/regex/fdr/fdr_loadval.h @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2015-2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef FDR_LOADVAL_H +#define FDR_LOADVAL_H + +#include "ue2common.h" +#include "util/unaligned.h" + +#define MAKE_LOADVAL(type, name) \ + static really_inline \ + type name(const u8 *ptr, UNUSED const u8 *lo, UNUSED const u8 *hi) + +#define NORMAL_SAFE(type) \ + do { \ + assert(ptr >= lo); \ + assert(ptr + sizeof(type) - 1 < hi); \ + } while(0) + +#define MAKE_LOOP_CE(TYPE) \ + TYPE v = 0; \ + for (TYPE i = 0; i < sizeof(TYPE); i++) { \ + if ((lo <= ptr + i) && (ptr + i < hi)) { \ + v += (TYPE)ptr[i] << (i*8); \ + } \ + } \ + return v; + +// no suffix = normal (unaligned) +// _ce = cautious everywhere (in both directions); test against hi and lo + +MAKE_LOADVAL(u16, lv_u16) { + NORMAL_SAFE(u16); + return unaligned_load_u16(ptr); +} + +MAKE_LOADVAL(u64a, lv_u64a) { + NORMAL_SAFE(u32); + return unaligned_load_u64a(ptr); +} + +MAKE_LOADVAL(u16, lv_u16_ce) { MAKE_LOOP_CE(u16); } + +MAKE_LOADVAL(u64a, lv_u64a_ce) { MAKE_LOOP_CE(u64a); } + +#endif diff --git a/regex/fdr/flood_runtime.h b/regex/fdr/flood_runtime.h new file mode 100644 index 000000000..2d5a32d92 --- /dev/null +++ b/regex/fdr/flood_runtime.h @@ -0,0 +1,337 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef FLOOD_RUNTIME +#define FLOOD_RUNTIME + +#if defined(ARCH_64_BIT) +#define FLOOD_64 +#else +#define FLOOD_32 +#endif +#define FLOOD_MINIMUM_SIZE 256 +#define FLOOD_BACKOFF_START 32 + +static really_inline +const u8 * nextFloodDetect(const u8 * buf, size_t len, u32 floodBackoff) { + // if we don't have a flood at either the start or end, + // or have a very small buffer, don't bother with flood detection + if (len < FLOOD_MINIMUM_SIZE) { + return buf + len; + } + + /* entry points in runtime.c prefetch relevant data */ +#ifndef FLOOD_32 + u64a x11 = *(const u64a *)ROUNDUP_PTR(buf, 8); + u64a x12 = *(const u64a *)ROUNDUP_PTR(buf+8, 8); + if (x11 == x12) { + return buf + floodBackoff; + } + u64a x21 = *(const u64a *)ROUNDUP_PTR(buf + len/2, 8); + u64a x22 = *(const u64a *)ROUNDUP_PTR(buf + len/2 + 8, 8); + if (x21 == x22) { + return buf + floodBackoff; + } + u64a x31 = *(const u64a *)ROUNDUP_PTR(buf + len - 24, 8); + u64a x32 = *(const u64a *)ROUNDUP_PTR(buf + len - 16, 8); + if (x31 == x32) { + return buf + floodBackoff; + } +#else + u32 x11 = *(const u32 *)ROUNDUP_PTR(buf, 4); + u32 x12 = *(const u32 *)ROUNDUP_PTR(buf+4, 4); + if (x11 == x12) { + return buf + floodBackoff; + } + u32 x21 = *(const u32 *)ROUNDUP_PTR(buf + len/2, 4); + u32 x22 = *(const u32 *)ROUNDUP_PTR(buf + len/2 + 4, 4); + if (x21 == x22) { + return buf + floodBackoff; + } + u32 x31 = *(const u32 *)ROUNDUP_PTR(buf + len - 12, 4); + u32 x32 = *(const u32 *)ROUNDUP_PTR(buf + len - 8, 4); + if (x31 == x32) { + return buf + floodBackoff; + } +#endif + return buf + len; +} + +static really_inline +const u8 * floodDetect(const struct FDR * fdr, + const struct FDR_Runtime_Args * a, + const u8 ** ptrPtr, + const u8 * tryFloodDetect, + u32 * floodBackoffPtr, + hwlmcb_rv_t * control, + u32 iterBytes) { + DEBUG_PRINTF("attempting flood detection at %p\n", tryFloodDetect); + const u8 * buf = a->buf; + const size_t len = a->len; + HWLMCallback cb = a->cb; + struct hs_scratch *scratch = a->scratch; + + const u8 * ptr = *ptrPtr; + // tryFloodDetect is never put in places where unconditional + // reads a short distance forward or backward here + // TODO: rationale for this line needs to be rediscovered!! + size_t mainLoopLen = len > 2 * iterBytes ? 
len - 2 * iterBytes : 0; + const u32 i = ptr - buf; + u32 j = i; + + // go from c to our FDRFlood structure + u8 c = buf[i]; + const u8 * fBase = ((const u8 *)fdr) + fdr->floodOffset; + u32 fIdx = ((const u32 *)fBase)[c]; + const struct FDRFlood * fsb = (const struct FDRFlood *)(fBase + sizeof(u32) * 256); + const struct FDRFlood * fl = &fsb[fIdx]; + +#ifndef FLOOD_32 + u64a cmpVal = c; + cmpVal |= cmpVal << 8; + cmpVal |= cmpVal << 16; + cmpVal |= cmpVal << 32; + u64a probe = *(const u64a *)ROUNDUP_PTR(buf+i, 8); +#else + u32 cmpVal = c; + cmpVal |= cmpVal << 8; + cmpVal |= cmpVal << 16; + u32 probe = *(const u32 *)ROUNDUP_PTR(buf+i, 4); +#endif + + if ((probe != cmpVal) || (fl->idCount >= FDR_FLOOD_MAX_IDS)) { + *floodBackoffPtr *= 2; + goto floodout; + } + + if (i < fl->suffix + 7) { + *floodBackoffPtr *= 2; + goto floodout; + } + + j = i - fl->suffix; + +#ifndef FLOOD_32 + j -= (u32)((uintptr_t)buf + j) & 0x7; // push j back to yield 8-aligned addrs + for (; j + 32 < mainLoopLen; j += 32) { + u64a v = *(const u64a *)(buf + j); + u64a v2 = *(const u64a *)(buf + j + 8); + u64a v3 = *(const u64a *)(buf + j + 16); + u64a v4 = *(const u64a *)(buf + j + 24); + if ((v4 != cmpVal) || (v3 != cmpVal) || (v2 != cmpVal) || (v != cmpVal)) { + break; + } + } + for (; j + 8 < mainLoopLen; j += 8) { + u64a v = *(const u64a *)(buf + j); + if (v != cmpVal) { + break; + } + } +#else + j -= (u32)((size_t)buf + j) & 0x3; // push j back to yield 4-aligned addrs + for (; j + 16 < mainLoopLen; j += 16) { + u32 v = *(const u32 *)(buf + j); + u32 v2 = *(const u32 *)(buf + j + 4); + u32 v3 = *(const u32 *)(buf + j + 8); + u32 v4 = *(const u32 *)(buf + j + 12); + if ((v4 != cmpVal) || (v3 != cmpVal) || (v2 != cmpVal) || (v != cmpVal)) { + break; + } + } + for (; j + 4 < mainLoopLen; j += 4) { + u32 v = *(const u32 *)(buf + j); + if (v != cmpVal) { + break; + } + } +#endif + for (; j < mainLoopLen; j++) { + u8 v = *(const u8 *)(buf + j); + if (v != c) { + break; + } + } + if (j > i ) { + j--; // needed for some reaches + u32 itersAhead = (j-i)/iterBytes; + u32 floodSize = itersAhead*iterBytes; + + DEBUG_PRINTF("flooding %u size j %u i %u fl->idCount %hu " + "*control %016llx fl->allGroups %016llx\n", + floodSize, j, i, fl->idCount, *control, fl->allGroups); + DEBUG_PRINTF("mainloopLen %zu mainStart ??? mainEnd ??? 
len %zu\n", + mainLoopLen, len); + + if (fl->idCount && (*control & fl->allGroups)) { + switch (fl->idCount) { +#if !defined(FLOOD_DEBUG) + // Carefully unrolled code + case 1: + for (u32 t = 0; t < floodSize && (*control & fl->allGroups); + t += 4) { + DEBUG_PRINTF("aaa %u %llx\n", t, fl->groups[0]); + if (*control & fl->groups[0]) { + *control = cb(i + t + 0, fl->ids[0], scratch); + } + if (*control & fl->groups[0]) { + *control = cb(i + t + 1, fl->ids[0], scratch); + } + if (*control & fl->groups[0]) { + *control = cb(i + t + 2, fl->ids[0], scratch); + } + if (*control & fl->groups[0]) { + *control = cb(i + t + 3, fl->ids[0], scratch); + } + } + break; + case 2: + for (u32 t = 0; t < floodSize && (*control & fl->allGroups); t += 4) { + if (*control & fl->groups[0]) { + *control = cb(i + t, fl->ids[0], scratch); + } + if (*control & fl->groups[1]) { + *control = cb(i + t, fl->ids[1], scratch); + } + if (*control & fl->groups[0]) { + *control = + cb(i + t + 1, fl->ids[0], scratch); + } + if (*control & fl->groups[1]) { + *control = cb(i + t + 1, fl->ids[1], scratch); + } + if (*control & fl->groups[0]) { + *control = cb(i + t + 2, fl->ids[0], scratch); + } + if (*control & fl->groups[1]) { + *control = cb(i + t + 2, fl->ids[1], scratch); + } + if (*control & fl->groups[0]) { + *control = cb(i + t + 3, fl->ids[0], scratch); + } + if (*control & fl->groups[1]) { + *control = cb(i + t + 3, fl->ids[1], scratch); + } + } + break; + case 3: + for (u32 t = 0; t < floodSize && (*control & fl->allGroups); t += 2) { + if (*control & fl->groups[0]) { + *control = cb(i + t, fl->ids[0], scratch); + } + if (*control & fl->groups[1]) { + *control = cb(i + t, fl->ids[1], scratch); + } + if (*control & fl->groups[2]) { + *control = cb(i + t, fl->ids[2], scratch); + } + if (*control & fl->groups[0]) { + *control = cb(i + t + 1, fl->ids[0], scratch); + } + if (*control & fl->groups[1]) { + *control = cb(i + t + 1, fl->ids[1], scratch); + } + if (*control & fl->groups[2]) { + *control = cb(i + t + 1, fl->ids[2], scratch); + } + } + break; + default: + // slow generalized loop + for (u32 t = 0; t < floodSize && (*control & fl->allGroups); t += 2) { + + if (*control & fl->groups[0]) { + *control = cb(i + t, fl->ids[0], scratch); + } + if (*control & fl->groups[1]) { + *control = cb(i + t, fl->ids[1], scratch); + } + if (*control & fl->groups[2]) { + *control = cb(i + t, fl->ids[2], scratch); + } + if (*control & fl->groups[3]) { + *control = cb(i + t, fl->ids[3], scratch); + } + + for (u32 t2 = 4; t2 < fl->idCount; t2++) { + if (*control & fl->groups[t2]) { + *control = cb(i + t, fl->ids[t2], scratch); + } + } + + if (*control & fl->groups[0]) { + *control = cb(i + t + 1, fl->ids[0], scratch); + } + if (*control & fl->groups[1]) { + *control = cb(i + t + 1, fl->ids[1], scratch); + } + if (*control & fl->groups[2]) { + *control = cb(i + t + 1, fl->ids[2], scratch); + } + if (*control & fl->groups[3]) { + *control = cb(i + t + 1, fl->ids[3], scratch); + } + + for (u32 t2 = 4; t2 < fl->idCount; t2++) { + if (*control & fl->groups[t2]) { + *control = cb(i + t + 1, fl->ids[t2], scratch); + } + } + } + break; +#else + // Fallback for debugging + default: + for (u32 t = 0; t < floodSize && (*control & fl->allGroups); t++) { + for (u32 t2 = 0; t2 < fl->idCount; t2++) { + if (*control & fl->groups[t2]) { + *control = cb(i + t, fl->ids[t2], scratch); + } + } + } +#endif + } + } + ptr += floodSize; + } else { + *floodBackoffPtr *= 2; + } + +floodout: + if (j + *floodBackoffPtr < mainLoopLen - 128) { + tryFloodDetect = 
buf + MAX(i,j) + *floodBackoffPtr; + } else { + tryFloodDetect = buf + mainLoopLen; // set so we never do another flood detect + } + *ptrPtr = ptr; + DEBUG_PRINTF("finished flood detection at %p (next check %p)\n", + ptr, tryFloodDetect); + return tryFloodDetect; +} + +#endif diff --git a/regex/fdr/teddy.c b/regex/fdr/teddy.c new file mode 100644 index 000000000..e6f547619 --- /dev/null +++ b/regex/fdr/teddy.c @@ -0,0 +1,1114 @@ +/* + * Copyright (c) 2015-2020, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Teddy literal matcher: SSSE3 engine runtime. 
+ */ + +#include "fdr_internal.h" +#include "flood_runtime.h" +#include "teddy.h" +#include "teddy_internal.h" +#include "teddy_runtime_common.h" +#include "util/simd_utils.h" + +const u8 ALIGN_DIRECTIVE p_mask_arr[17][32] = { + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00} +}; + +#if defined(HAVE_AVX512VBMI) // VBMI strong teddy + +#define CONF_CHUNK_64(chunk, bucket, off, reason, pt, conf_fn) \ +do { \ + if (unlikely(chunk != ones_u64a)) { \ + chunk = ~chunk; \ + conf_fn(&chunk, bucket, off, confBase, reason, a, pt, \ + &control, &last_match); \ + CHECK_HWLM_TERMINATE_MATCHING; \ + } \ +} while(0) + +#define CONF_CHUNK_32(chunk, bucket, off, reason, pt, conf_fn) \ +do { \ + if (unlikely(chunk != ones_u32)) { \ + chunk = ~chunk; \ + conf_fn(&chunk, bucket, off, confBase, reason, a, pt, \ + &control, &last_match); \ + CHECK_HWLM_TERMINATE_MATCHING; \ + } \ +} while(0) + +#else + +#define CONF_CHUNK_64(chunk, bucket, off, reason, conf_fn) \ +do { \ + if (unlikely(chunk != ones_u64a)) { \ + chunk = ~chunk; \ + conf_fn(&chunk, bucket, off, confBase, reason, a, ptr, \ + &control, &last_match); \ + CHECK_HWLM_TERMINATE_MATCHING; \ + } \ +} while(0) + +#define CONF_CHUNK_32(chunk, bucket, off, reason, conf_fn) \ +do { \ + if (unlikely(chunk != ones_u32)) { \ + chunk = ~chunk; \ + conf_fn(&chunk, bucket, off, confBase, reason, a, ptr, \ + &control, &last_match); \ + CHECK_HWLM_TERMINATE_MATCHING; \ + } \ +} while(0) + +#endif + +#if defined(HAVE_AVX512VBMI) // VBMI strong teddy + +#ifdef ARCH_64_BIT +#define CONFIRM_TEDDY(var, bucket, offset, reason, pt, conf_fn) \ +do { \ + if (unlikely(diff512(var, ones512()))) { \ + m128 p128_0 = extract128from512(var, 0); \ + m128 p128_1 = extract128from512(var, 1); \ + m128 p128_2 = extract128from512(var, 2); \ + m128 p128_3 = extract128from512(var, 3); \ + u64a part1 = movq(p128_0); \ + u64a part2 = movq(rshiftbyte_m128(p128_0, 8)); \ + u64a part3 = movq(p128_1); \ + u64a part4 = movq(rshiftbyte_m128(p128_1, 8)); \ + u64a part5 = movq(p128_2); \ + u64a part6 = movq(rshiftbyte_m128(p128_2, 8)); \ + u64a part7 = movq(p128_3); \ + u64a part8 = movq(rshiftbyte_m128(p128_3, 8)); \ + CONF_CHUNK_64(part1, bucket, offset, reason, pt, conf_fn); \ + CONF_CHUNK_64(part2, bucket, offset + 8, reason, pt, conf_fn); \ + CONF_CHUNK_64(part3, bucket, offset + 16, reason, pt, conf_fn); \ + CONF_CHUNK_64(part4, bucket, offset + 24, reason, pt, conf_fn); \ + CONF_CHUNK_64(part5, bucket, offset + 32, reason, pt, conf_fn); \ + CONF_CHUNK_64(part6, bucket, offset + 40, reason, pt, conf_fn); \ + CONF_CHUNK_64(part7, bucket, offset + 48, reason, pt, conf_fn); \ + CONF_CHUNK_64(part8, bucket, offset + 56, reason, pt, conf_fn); \ + } \ +} while(0) +#else +#define CONFIRM_TEDDY(var, bucket, offset, reason, pt, conf_fn) \ +do { \ + if (unlikely(diff512(var, ones512()))) { \ + m128 p128_0 = extract128from512(var, 0); \ + m128 p128_1 = extract128from512(var, 1); \ + m128 p128_2 = extract128from512(var, 2); \ + m128 p128_3 = extract128from512(var, 3); \ + u32 part1 = movd(p128_0); \ + u32 part2 = movd(rshiftbyte_m128(p128_0, 4)); \ + u32 part3 = movd(rshiftbyte_m128(p128_0, 8)); \ + u32 part4 = movd(rshiftbyte_m128(p128_0, 12)); \ + u32 part5 = movd(p128_1); \ + u32 part6 = movd(rshiftbyte_m128(p128_1, 4)); \ + u32 part7 = movd(rshiftbyte_m128(p128_1, 8)); \ + u32 part8 = movd(rshiftbyte_m128(p128_1, 12)); \ + u32 part9 = movd(p128_2); \ + u32 part10 = movd(rshiftbyte_m128(p128_2, 4)); \ + u32 part11 = movd(rshiftbyte_m128(p128_2, 8)); \ + u32 part12 = movd(rshiftbyte_m128(p128_2, 12)); \ + u32 part13 = movd(p128_3); \ + u32 part14 = movd(rshiftbyte_m128(p128_3, 4)); \ + u32 part15 = movd(rshiftbyte_m128(p128_3, 8)); \ + u32 part16 = movd(rshiftbyte_m128(p128_3, 12)); \ + CONF_CHUNK_32(part1, bucket, offset, reason, 
pt, conf_fn); \ + CONF_CHUNK_32(part2, bucket, offset + 4, reason, pt, conf_fn); \ + CONF_CHUNK_32(part3, bucket, offset + 8, reason, pt, conf_fn); \ + CONF_CHUNK_32(part4, bucket, offset + 12, reason, pt, conf_fn); \ + CONF_CHUNK_32(part5, bucket, offset + 16, reason, pt, conf_fn); \ + CONF_CHUNK_32(part6, bucket, offset + 20, reason, pt, conf_fn); \ + CONF_CHUNK_32(part7, bucket, offset + 24, reason, pt, conf_fn); \ + CONF_CHUNK_32(part8, bucket, offset + 28, reason, pt, conf_fn); \ + CONF_CHUNK_32(part9, bucket, offset + 32, reason, pt, conf_fn); \ + CONF_CHUNK_32(part10, bucket, offset + 36, reason, pt, conf_fn); \ + CONF_CHUNK_32(part11, bucket, offset + 40, reason, pt, conf_fn); \ + CONF_CHUNK_32(part12, bucket, offset + 44, reason, pt, conf_fn); \ + CONF_CHUNK_32(part13, bucket, offset + 48, reason, pt, conf_fn); \ + CONF_CHUNK_32(part14, bucket, offset + 52, reason, pt, conf_fn); \ + CONF_CHUNK_32(part15, bucket, offset + 56, reason, pt, conf_fn); \ + CONF_CHUNK_32(part16, bucket, offset + 60, reason, pt, conf_fn); \ + } \ +} while(0) +#endif + +#define PREP_SHUF_MASK \ + m512 lo = and512(val, *lo_mask); \ + m512 hi = and512(rshift64_m512(val, 4), *lo_mask) + +#define TEDDY_VBMI_PSHUFB_OR_M1 \ + m512 shuf_or_b0 = or512(pshufb_m512(dup_mask[0], lo), \ + pshufb_m512(dup_mask[1], hi)); + +#define TEDDY_VBMI_PSHUFB_OR_M2 \ + TEDDY_VBMI_PSHUFB_OR_M1 \ + m512 shuf_or_b1 = or512(pshufb_m512(dup_mask[2], lo), \ + pshufb_m512(dup_mask[3], hi)); + +#define TEDDY_VBMI_PSHUFB_OR_M3 \ + TEDDY_VBMI_PSHUFB_OR_M2 \ + m512 shuf_or_b2 = or512(pshufb_m512(dup_mask[4], lo), \ + pshufb_m512(dup_mask[5], hi)); + +#define TEDDY_VBMI_PSHUFB_OR_M4 \ + TEDDY_VBMI_PSHUFB_OR_M3 \ + m512 shuf_or_b3 = or512(pshufb_m512(dup_mask[6], lo), \ + pshufb_m512(dup_mask[7], hi)); + +#define TEDDY_VBMI_SL1_MASK 0xfffffffffffffffeULL +#define TEDDY_VBMI_SL2_MASK 0xfffffffffffffffcULL +#define TEDDY_VBMI_SL3_MASK 0xfffffffffffffff8ULL + +#define TEDDY_VBMI_SHIFT_M1 + +#define TEDDY_VBMI_SHIFT_M2 \ + TEDDY_VBMI_SHIFT_M1 \ + m512 sl1 = maskz_vpermb512(TEDDY_VBMI_SL1_MASK, sl_msk[0], shuf_or_b1); + +#define TEDDY_VBMI_SHIFT_M3 \ + TEDDY_VBMI_SHIFT_M2 \ + m512 sl2 = maskz_vpermb512(TEDDY_VBMI_SL2_MASK, sl_msk[1], shuf_or_b2); + +#define TEDDY_VBMI_SHIFT_M4 \ + TEDDY_VBMI_SHIFT_M3 \ + m512 sl3 = maskz_vpermb512(TEDDY_VBMI_SL3_MASK, sl_msk[2], shuf_or_b3); + +#define SHIFT_OR_M1 \ + shuf_or_b0 + +#define SHIFT_OR_M2 \ + or512(sl1, SHIFT_OR_M1) + +#define SHIFT_OR_M3 \ + or512(sl2, SHIFT_OR_M2) + +#define SHIFT_OR_M4 \ + or512(sl3, SHIFT_OR_M3) + +static really_inline +m512 prep_conf_teddy_m1(const m512 *lo_mask, const m512 *dup_mask, + UNUSED const m512 *sl_msk, const m512 val) { + PREP_SHUF_MASK; + TEDDY_VBMI_PSHUFB_OR_M1; + TEDDY_VBMI_SHIFT_M1; + return SHIFT_OR_M1; +} + +static really_inline +m512 prep_conf_teddy_m2(const m512 *lo_mask, const m512 *dup_mask, + const m512 *sl_msk, const m512 val) { + PREP_SHUF_MASK; + TEDDY_VBMI_PSHUFB_OR_M2; + TEDDY_VBMI_SHIFT_M2; + return SHIFT_OR_M2; +} + +static really_inline +m512 prep_conf_teddy_m3(const m512 *lo_mask, const m512 *dup_mask, + const m512 *sl_msk, const m512 val) { + PREP_SHUF_MASK; + TEDDY_VBMI_PSHUFB_OR_M3; + TEDDY_VBMI_SHIFT_M3; + return SHIFT_OR_M3; +} + +static really_inline +m512 prep_conf_teddy_m4(const m512 *lo_mask, const m512 *dup_mask, + const m512 *sl_msk, const m512 val) { + PREP_SHUF_MASK; + TEDDY_VBMI_PSHUFB_OR_M4; + TEDDY_VBMI_SHIFT_M4; + return SHIFT_OR_M4; +} + +#define PREP_CONF_FN(val, n) \ + prep_conf_teddy_m##n(&lo_mask, dup_mask, sl_msk, val) + 
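Conceptually, the preparation step built by the macros and prep_conf_teddy_m1..m4 above is a per-byte table lookup: each input byte is split into its two nibbles, each nibble indexes a 16-entry bucket bitmap (maskBase[2*m] for the low nibble, maskBase[2*m+1] for the high nibble), and the two bitmaps are OR'd, so a bit stays zero only when both nibbles are compatible with that bucket at mask position m. The per-position results of consecutive bytes are then shifted and OR'd so that a zero bit survives at an end position only if the last n_msk bytes all passed their tables. The scalar model below is a minimal sketch of that idea only; teddy_byte_class and teddy_candidates_at are hypothetical helpers, not functions from this patch, and the real engine does all of this 64 bytes at a time with PSHUFB/VPERMB.

#include <stddef.h>
#include <stdint.h>

/* One pair of 16-entry nibble tables per mask position; a cleared bit b
 * means "this nibble is compatible with bucket b at this position". */
struct nibble_tables {
    uint8_t lo[16];
    uint8_t hi[16];
};

/* Scalar equivalent of or(pshufb(lo_tbl, lo), pshufb(hi_tbl, hi)):
 * bucket bit b stays 0 only if both nibbles of c are allowed. */
static uint8_t teddy_byte_class(const struct nibble_tables *t, uint8_t c)
{
    return (uint8_t)(t->lo[c & 0xf] | t->hi[c >> 4]);
}

/* Scalar equivalent of the shift-and-OR combination: the candidate bitmap
 * for end position p ORs the classifications of the last n_msk bytes.
 * A zero bit means "hand this (position, bucket) pair to the confirm step". */
static uint8_t teddy_candidates_at(const struct nibble_tables *tbl,
                                   unsigned n_msk, const uint8_t *buf,
                                   size_t p)
{
    uint8_t res = teddy_byte_class(&tbl[0], buf[p]);

    for (unsigned k = 1; k < n_msk && k <= p; k++)
        res |= teddy_byte_class(&tbl[k], buf[p - k]);

    return res;
}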
+#define TEDDY_VBMI_SL1_POS 15 +#define TEDDY_VBMI_SL2_POS 14 +#define TEDDY_VBMI_SL3_POS 13 + +#define TEDDY_VBMI_LOAD_SHIFT_MASK_M1 + +#define TEDDY_VBMI_LOAD_SHIFT_MASK_M2 \ + TEDDY_VBMI_LOAD_SHIFT_MASK_M1 \ + sl_msk[0] = loadu512(p_sh_mask_arr + TEDDY_VBMI_SL1_POS); + +#define TEDDY_VBMI_LOAD_SHIFT_MASK_M3 \ + TEDDY_VBMI_LOAD_SHIFT_MASK_M2 \ + sl_msk[1] = loadu512(p_sh_mask_arr + TEDDY_VBMI_SL2_POS); + +#define TEDDY_VBMI_LOAD_SHIFT_MASK_M4 \ + TEDDY_VBMI_LOAD_SHIFT_MASK_M3 \ + sl_msk[2] = loadu512(p_sh_mask_arr + TEDDY_VBMI_SL3_POS); + +#define PREPARE_MASKS_1 \ + dup_mask[0] = set4x128(maskBase[0]); \ + dup_mask[1] = set4x128(maskBase[1]); + +#define PREPARE_MASKS_2 \ + PREPARE_MASKS_1 \ + dup_mask[2] = set4x128(maskBase[2]); \ + dup_mask[3] = set4x128(maskBase[3]); + +#define PREPARE_MASKS_3 \ + PREPARE_MASKS_2 \ + dup_mask[4] = set4x128(maskBase[4]); \ + dup_mask[5] = set4x128(maskBase[5]); + +#define PREPARE_MASKS_4 \ + PREPARE_MASKS_3 \ + dup_mask[6] = set4x128(maskBase[6]); \ + dup_mask[7] = set4x128(maskBase[7]); + +#define PREPARE_MASKS(n) \ + m512 lo_mask = set64x8(0xf); \ + m512 dup_mask[n * 2]; \ + m512 sl_msk[n - 1]; \ + PREPARE_MASKS_##n \ + TEDDY_VBMI_LOAD_SHIFT_MASK_M##n + +#define TEDDY_VBMI_CONF_MASK_HEAD (0xffffffffffffffffULL >> n_sh) +#define TEDDY_VBMI_CONF_MASK_FULL (0xffffffffffffffffULL << n_sh) +#define TEDDY_VBMI_CONF_MASK_VAR(n) (0xffffffffffffffffULL >> (64 - n) << overlap) +#define TEDDY_VBMI_LOAD_MASK_PATCH (0xffffffffffffffffULL >> (64 - n_sh)) + +#define FDR_EXEC_TEDDY(fdr, a, control, n_msk, conf_fn) \ +do { \ + const u8 *buf_end = a->buf + a->len; \ + const u8 *ptr = a->buf + a->start_offset; \ + u32 floodBackoff = FLOOD_BACKOFF_START; \ + const u8 *tryFloodDetect = a->firstFloodDetect; \ + u32 last_match = ones_u32; \ + const struct Teddy *teddy = (const struct Teddy *)fdr; \ + const size_t iterBytes = 64; \ + u32 n_sh = n_msk - 1; \ + const size_t loopBytes = 64 - n_sh; \ + DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n", \ + a->buf, a->len, a->start_offset); \ + \ + const m128 *maskBase = getMaskBase(teddy); \ + PREPARE_MASKS(n_msk); \ + const u32 *confBase = getConfBase(teddy); \ + \ + u64a k = TEDDY_VBMI_CONF_MASK_FULL; \ + m512 p_mask = set_mask_m512(~k); \ + u32 overlap = 0; \ + u64a patch = 0; \ + if (likely(ptr + loopBytes <= buf_end)) { \ + m512 p_mask0 = set_mask_m512(~TEDDY_VBMI_CONF_MASK_HEAD); \ + m512 r_0 = PREP_CONF_FN(loadu512(ptr), n_msk); \ + r_0 = or512(r_0, p_mask0); \ + CONFIRM_TEDDY(r_0, 8, 0, VECTORING, ptr, conf_fn); \ + ptr += loopBytes; \ + overlap = n_sh; \ + patch = TEDDY_VBMI_LOAD_MASK_PATCH; \ + } \ + \ + for (; ptr + loopBytes <= buf_end; ptr += loopBytes) { \ + __builtin_prefetch(ptr - n_sh + (64 * 2)); \ + CHECK_FLOOD; \ + m512 r_0 = PREP_CONF_FN(loadu512(ptr - n_sh), n_msk); \ + r_0 = or512(r_0, p_mask); \ + CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, ptr - n_sh, conf_fn); \ + } \ + \ + assert(ptr + loopBytes > buf_end); \ + if (ptr < buf_end) { \ + u32 left = (u32)(buf_end - ptr); \ + u64a k1 = TEDDY_VBMI_CONF_MASK_VAR(left); \ + m512 p_mask1 = set_mask_m512(~k1); \ + m512 val_0 = loadu_maskz_m512(k1 | patch, ptr - overlap); \ + m512 r_0 = PREP_CONF_FN(val_0, n_msk); \ + r_0 = or512(r_0, p_mask1); \ + CONFIRM_TEDDY(r_0, 8, 0, VECTORING, ptr - overlap, conf_fn); \ + } \ + \ + return HWLM_SUCCESS; \ +} while(0) + +#elif defined(HAVE_AVX512) // AVX512 reinforced teddy + +#ifdef ARCH_64_BIT +#define CONFIRM_TEDDY(var, bucket, offset, reason, conf_fn) \ +do { \ + if (unlikely(diff512(var, ones512()))) { \ + m128 
p128_0 = extract128from512(var, 0); \ + m128 p128_1 = extract128from512(var, 1); \ + m128 p128_2 = extract128from512(var, 2); \ + m128 p128_3 = extract128from512(var, 3); \ + u64a part1 = movq(p128_0); \ + u64a part2 = movq(rshiftbyte_m128(p128_0, 8)); \ + u64a part3 = movq(p128_1); \ + u64a part4 = movq(rshiftbyte_m128(p128_1, 8)); \ + u64a part5 = movq(p128_2); \ + u64a part6 = movq(rshiftbyte_m128(p128_2, 8)); \ + u64a part7 = movq(p128_3); \ + u64a part8 = movq(rshiftbyte_m128(p128_3, 8)); \ + CONF_CHUNK_64(part1, bucket, offset, reason, conf_fn); \ + CONF_CHUNK_64(part2, bucket, offset + 8, reason, conf_fn); \ + CONF_CHUNK_64(part3, bucket, offset + 16, reason, conf_fn); \ + CONF_CHUNK_64(part4, bucket, offset + 24, reason, conf_fn); \ + CONF_CHUNK_64(part5, bucket, offset + 32, reason, conf_fn); \ + CONF_CHUNK_64(part6, bucket, offset + 40, reason, conf_fn); \ + CONF_CHUNK_64(part7, bucket, offset + 48, reason, conf_fn); \ + CONF_CHUNK_64(part8, bucket, offset + 56, reason, conf_fn); \ + } \ +} while(0) +#else +#define CONFIRM_TEDDY(var, bucket, offset, reason, conf_fn) \ +do { \ + if (unlikely(diff512(var, ones512()))) { \ + m128 p128_0 = extract128from512(var, 0); \ + m128 p128_1 = extract128from512(var, 1); \ + m128 p128_2 = extract128from512(var, 2); \ + m128 p128_3 = extract128from512(var, 3); \ + u32 part1 = movd(p128_0); \ + u32 part2 = movd(rshiftbyte_m128(p128_0, 4)); \ + u32 part3 = movd(rshiftbyte_m128(p128_0, 8)); \ + u32 part4 = movd(rshiftbyte_m128(p128_0, 12)); \ + u32 part5 = movd(p128_1); \ + u32 part6 = movd(rshiftbyte_m128(p128_1, 4)); \ + u32 part7 = movd(rshiftbyte_m128(p128_1, 8)); \ + u32 part8 = movd(rshiftbyte_m128(p128_1, 12)); \ + u32 part9 = movd(p128_2); \ + u32 part10 = movd(rshiftbyte_m128(p128_2, 4)); \ + u32 part11 = movd(rshiftbyte_m128(p128_2, 8)); \ + u32 part12 = movd(rshiftbyte_m128(p128_2, 12)); \ + u32 part13 = movd(p128_3); \ + u32 part14 = movd(rshiftbyte_m128(p128_3, 4)); \ + u32 part15 = movd(rshiftbyte_m128(p128_3, 8)); \ + u32 part16 = movd(rshiftbyte_m128(p128_3, 12)); \ + CONF_CHUNK_32(part1, bucket, offset, reason, conf_fn); \ + CONF_CHUNK_32(part2, bucket, offset + 4, reason, conf_fn); \ + CONF_CHUNK_32(part3, bucket, offset + 8, reason, conf_fn); \ + CONF_CHUNK_32(part4, bucket, offset + 12, reason, conf_fn); \ + CONF_CHUNK_32(part5, bucket, offset + 16, reason, conf_fn); \ + CONF_CHUNK_32(part6, bucket, offset + 20, reason, conf_fn); \ + CONF_CHUNK_32(part7, bucket, offset + 24, reason, conf_fn); \ + CONF_CHUNK_32(part8, bucket, offset + 28, reason, conf_fn); \ + CONF_CHUNK_32(part9, bucket, offset + 32, reason, conf_fn); \ + CONF_CHUNK_32(part10, bucket, offset + 36, reason, conf_fn); \ + CONF_CHUNK_32(part11, bucket, offset + 40, reason, conf_fn); \ + CONF_CHUNK_32(part12, bucket, offset + 44, reason, conf_fn); \ + CONF_CHUNK_32(part13, bucket, offset + 48, reason, conf_fn); \ + CONF_CHUNK_32(part14, bucket, offset + 52, reason, conf_fn); \ + CONF_CHUNK_32(part15, bucket, offset + 56, reason, conf_fn); \ + CONF_CHUNK_32(part16, bucket, offset + 60, reason, conf_fn); \ + } \ +} while(0) +#endif + +#define PREP_SHUF_MASK_NO_REINFORCEMENT(val) \ + m512 lo = and512(val, *lo_mask); \ + m512 hi = and512(rshift64_m512(val, 4), *lo_mask) + +#define PREP_SHUF_MASK \ + PREP_SHUF_MASK_NO_REINFORCEMENT(load512(ptr)); \ + *c_16 = *(ptr + 15); \ + *c_32 = *(ptr + 31); \ + *c_48 = *(ptr + 47); \ + m512 r_msk = set512_64(0ULL, r_msk_base[*c_48], 0ULL, r_msk_base[*c_32],\ + 0ULL, r_msk_base[*c_16], 0ULL, r_msk_base[*c_0]);\ + *c_0 = *(ptr + 63) + 
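When one of the 512-bit result vectors above is not all-ones, CONFIRM_TEDDY carves it into 64-bit (or 32-bit on 32-bit builds) chunks and hands each interesting chunk to the confirm callback: every result byte is an 8-bucket bitmap, so the inverted chunk is simply a list of (byte offset, bucket) candidates. The r_msk value OR'd in by PREP_SHUF_MASK is the "reinforcement" part of this variant: the in-lane byte shifts cannot carry a lane's trailing byte into the next 128-bit lane, so that byte's contribution is looked up per byte value in r_msk_base instead. The loop below is a minimal sketch of the chunk-decoding step only; scan_conf_chunk and report_candidate are hypothetical names, and the real confirm routine additionally checks the group mask and verifies the full literal.

#include <stdint.h>

/* Decode one 64-bit confirm chunk: 8 result bytes, one 8-bucket bitmap per
 * byte. The engine inverts the chunk, so after inversion a set bit
 * identifies a candidate (byte offset, bucket) pair that still has to be
 * verified against the actual literal. */
static void scan_conf_chunk(uint64_t chunk, uint32_t base_offset,
                            void (*report_candidate)(uint32_t pos,
                                                     uint32_t bucket))
{
    uint64_t conf = ~chunk;                 /* zero bits were the candidates */

    while (conf) {
        unsigned bit = (unsigned)__builtin_ctzll(conf);
        conf &= conf - 1;                   /* clear the lowest set bit */
        report_candidate(base_offset + bit / 8,  /* which of the 8 bytes */
                         bit % 8);               /* which bucket in that byte */
    }
}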
+#define SHIFT_OR_M1 \ + or512(pshufb_m512(dup_mask[0], lo), pshufb_m512(dup_mask[1], hi)) + +#define SHIFT_OR_M2 \ + or512(lshift128_m512(or512(pshufb_m512(dup_mask[2], lo), \ + pshufb_m512(dup_mask[3], hi)), \ + 1), SHIFT_OR_M1) + +#define SHIFT_OR_M3 \ + or512(lshift128_m512(or512(pshufb_m512(dup_mask[4], lo), \ + pshufb_m512(dup_mask[5], hi)), \ + 2), SHIFT_OR_M2) + +#define SHIFT_OR_M4 \ + or512(lshift128_m512(or512(pshufb_m512(dup_mask[6], lo), \ + pshufb_m512(dup_mask[7], hi)), \ + 3), SHIFT_OR_M3) + +static really_inline +m512 prep_conf_teddy_no_reinforcement_m1(const m512 *lo_mask, + const m512 *dup_mask, + const m512 val) { + PREP_SHUF_MASK_NO_REINFORCEMENT(val); + return SHIFT_OR_M1; +} + +static really_inline +m512 prep_conf_teddy_no_reinforcement_m2(const m512 *lo_mask, + const m512 *dup_mask, + const m512 val) { + PREP_SHUF_MASK_NO_REINFORCEMENT(val); + return SHIFT_OR_M2; +} + +static really_inline +m512 prep_conf_teddy_no_reinforcement_m3(const m512 *lo_mask, + const m512 *dup_mask, + const m512 val) { + PREP_SHUF_MASK_NO_REINFORCEMENT(val); + return SHIFT_OR_M3; +} + +static really_inline +m512 prep_conf_teddy_no_reinforcement_m4(const m512 *lo_mask, + const m512 *dup_mask, + const m512 val) { + PREP_SHUF_MASK_NO_REINFORCEMENT(val); + return SHIFT_OR_M4; +} + +static really_inline +m512 prep_conf_teddy_m1(const m512 *lo_mask, const m512 *dup_mask, + const u8 *ptr, const u64a *r_msk_base, + u32 *c_0, u32 *c_16, u32 *c_32, u32 *c_48) { + PREP_SHUF_MASK; + return or512(SHIFT_OR_M1, r_msk); +} + +static really_inline +m512 prep_conf_teddy_m2(const m512 *lo_mask, const m512 *dup_mask, + const u8 *ptr, const u64a *r_msk_base, + u32 *c_0, u32 *c_16, u32 *c_32, u32 *c_48) { + PREP_SHUF_MASK; + return or512(SHIFT_OR_M2, r_msk); +} + +static really_inline +m512 prep_conf_teddy_m3(const m512 *lo_mask, const m512 *dup_mask, + const u8 *ptr, const u64a *r_msk_base, + u32 *c_0, u32 *c_16, u32 *c_32, u32 *c_48) { + PREP_SHUF_MASK; + return or512(SHIFT_OR_M3, r_msk); +} + +static really_inline +m512 prep_conf_teddy_m4(const m512 *lo_mask, const m512 *dup_mask, + const u8 *ptr, const u64a *r_msk_base, + u32 *c_0, u32 *c_16, u32 *c_32, u32 *c_48) { + PREP_SHUF_MASK; + return or512(SHIFT_OR_M4, r_msk); +} + +#define PREP_CONF_FN_NO_REINFORCEMENT(val, n) \ + prep_conf_teddy_no_reinforcement_m##n(&lo_mask, dup_mask, val) + +#define PREP_CONF_FN(ptr, n) \ + prep_conf_teddy_m##n(&lo_mask, dup_mask, ptr, r_msk_base, \ + &c_0, &c_16, &c_32, &c_48) + +#define PREPARE_MASKS_1 \ + dup_mask[0] = set4x128(maskBase[0]); \ + dup_mask[1] = set4x128(maskBase[1]); + +#define PREPARE_MASKS_2 \ + PREPARE_MASKS_1 \ + dup_mask[2] = set4x128(maskBase[2]); \ + dup_mask[3] = set4x128(maskBase[3]); + +#define PREPARE_MASKS_3 \ + PREPARE_MASKS_2 \ + dup_mask[4] = set4x128(maskBase[4]); \ + dup_mask[5] = set4x128(maskBase[5]); + +#define PREPARE_MASKS_4 \ + PREPARE_MASKS_3 \ + dup_mask[6] = set4x128(maskBase[6]); \ + dup_mask[7] = set4x128(maskBase[7]); + +#define PREPARE_MASKS(n) \ + m512 lo_mask = set64x8(0xf); \ + m512 dup_mask[n * 2]; \ + PREPARE_MASKS_##n + +#define FDR_EXEC_TEDDY(fdr, a, control, n_msk, conf_fn) \ +do { \ + const u8 *buf_end = a->buf + a->len; \ + const u8 *ptr = a->buf + a->start_offset; \ + u32 floodBackoff = FLOOD_BACKOFF_START; \ + const u8 *tryFloodDetect = a->firstFloodDetect; \ + u32 last_match = ones_u32; \ + const struct Teddy *teddy = (const struct Teddy *)fdr; \ + const size_t iterBytes = 128; \ + DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n", \ + a->buf, a->len, 
a->start_offset); \ + \ + const m128 *maskBase = getMaskBase(teddy); \ + PREPARE_MASKS(n_msk); \ + const u32 *confBase = getConfBase(teddy); \ + \ + const u64a *r_msk_base = getReinforcedMaskBase(teddy, n_msk); \ + u32 c_0 = 0x100; \ + u32 c_16 = 0x100; \ + u32 c_32 = 0x100; \ + u32 c_48 = 0x100; \ + const u8 *mainStart = ROUNDUP_PTR(ptr, 64); \ + DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart); \ + if (ptr < mainStart) { \ + ptr = mainStart - 64; \ + m512 p_mask; \ + m512 val_0 = vectoredLoad512(&p_mask, ptr, a->start_offset, \ + a->buf, buf_end, \ + a->buf_history, a->len_history, n_msk); \ + m512 r_0 = PREP_CONF_FN_NO_REINFORCEMENT(val_0, n_msk); \ + r_0 = or512(r_0, p_mask); \ + CONFIRM_TEDDY(r_0, 8, 0, VECTORING, conf_fn); \ + ptr += 64; \ + } \ + \ + if (ptr + 64 <= buf_end) { \ + m512 r_0 = PREP_CONF_FN(ptr, n_msk); \ + CONFIRM_TEDDY(r_0, 8, 0, VECTORING, conf_fn); \ + ptr += 64; \ + } \ + \ + for (; ptr + iterBytes <= buf_end; ptr += iterBytes) { \ + __builtin_prefetch(ptr + (iterBytes * 4)); \ + CHECK_FLOOD; \ + m512 r_0 = PREP_CONF_FN(ptr, n_msk); \ + CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, conf_fn); \ + m512 r_1 = PREP_CONF_FN(ptr + 64, n_msk); \ + CONFIRM_TEDDY(r_1, 8, 64, NOT_CAUTIOUS, conf_fn); \ + } \ + \ + if (ptr + 64 <= buf_end) { \ + m512 r_0 = PREP_CONF_FN(ptr, n_msk); \ + CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, conf_fn); \ + ptr += 64; \ + } \ + \ + assert(ptr + 64 > buf_end); \ + if (ptr < buf_end) { \ + m512 p_mask; \ + m512 val_0 = vectoredLoad512(&p_mask, ptr, 0, ptr, buf_end, \ + a->buf_history, a->len_history, n_msk); \ + m512 r_0 = PREP_CONF_FN_NO_REINFORCEMENT(val_0, n_msk); \ + r_0 = or512(r_0, p_mask); \ + CONFIRM_TEDDY(r_0, 8, 0, VECTORING, conf_fn); \ + } \ + \ + return HWLM_SUCCESS; \ +} while(0) + +#elif defined(HAVE_AVX2) // not HAVE_AVX512 but HAVE_AVX2 reinforced teddy + +#ifdef ARCH_64_BIT +#define CONFIRM_TEDDY(var, bucket, offset, reason, conf_fn) \ +do { \ + if (unlikely(diff256(var, ones256()))) { \ + m128 lo = movdq_lo(var); \ + m128 hi = movdq_hi(var); \ + u64a part1 = movq(lo); \ + u64a part2 = movq(rshiftbyte_m128(lo, 8)); \ + u64a part3 = movq(hi); \ + u64a part4 = movq(rshiftbyte_m128(hi, 8)); \ + CONF_CHUNK_64(part1, bucket, offset, reason, conf_fn); \ + CONF_CHUNK_64(part2, bucket, offset + 8, reason, conf_fn); \ + CONF_CHUNK_64(part3, bucket, offset + 16, reason, conf_fn); \ + CONF_CHUNK_64(part4, bucket, offset + 24, reason, conf_fn); \ + } \ +} while(0) +#else +#define CONFIRM_TEDDY(var, bucket, offset, reason, conf_fn) \ +do { \ + if (unlikely(diff256(var, ones256()))) { \ + m128 lo = movdq_lo(var); \ + m128 hi = movdq_hi(var); \ + u32 part1 = movd(lo); \ + u32 part2 = movd(rshiftbyte_m128(lo, 4)); \ + u32 part3 = movd(rshiftbyte_m128(lo, 8)); \ + u32 part4 = movd(rshiftbyte_m128(lo, 12)); \ + u32 part5 = movd(hi); \ + u32 part6 = movd(rshiftbyte_m128(hi, 4)); \ + u32 part7 = movd(rshiftbyte_m128(hi, 8)); \ + u32 part8 = movd(rshiftbyte_m128(hi, 12)); \ + CONF_CHUNK_32(part1, bucket, offset, reason, conf_fn); \ + CONF_CHUNK_32(part2, bucket, offset + 4, reason, conf_fn); \ + CONF_CHUNK_32(part3, bucket, offset + 8, reason, conf_fn); \ + CONF_CHUNK_32(part4, bucket, offset + 12, reason, conf_fn); \ + CONF_CHUNK_32(part5, bucket, offset + 16, reason, conf_fn); \ + CONF_CHUNK_32(part6, bucket, offset + 20, reason, conf_fn); \ + CONF_CHUNK_32(part7, bucket, offset + 24, reason, conf_fn); \ + CONF_CHUNK_32(part8, bucket, offset + 28, reason, conf_fn); \ + } \ +} while(0) +#endif + +#define PREP_SHUF_MASK_NO_REINFORCEMENT(val) \ 
+ m256 lo = and256(val, *lo_mask); \ + m256 hi = and256(rshift64_m256(val, 4), *lo_mask) + +#define PREP_SHUF_MASK \ + PREP_SHUF_MASK_NO_REINFORCEMENT(load256(ptr)); \ + *c_128 = *(ptr + 15); \ + m256 r_msk = set64x4(0ULL, r_msk_base[*c_128], 0ULL, r_msk_base[*c_0]); \ + *c_0 = *(ptr + 31) + +#define SHIFT_OR_M1 \ + or256(pshufb_m256(dup_mask[0], lo), pshufb_m256(dup_mask[1], hi)) + +#define SHIFT_OR_M2 \ + or256(lshift128_m256(or256(pshufb_m256(dup_mask[2], lo), \ + pshufb_m256(dup_mask[3], hi)), \ + 1), SHIFT_OR_M1) + +#define SHIFT_OR_M3 \ + or256(lshift128_m256(or256(pshufb_m256(dup_mask[4], lo), \ + pshufb_m256(dup_mask[5], hi)), \ + 2), SHIFT_OR_M2) + +#define SHIFT_OR_M4 \ + or256(lshift128_m256(or256(pshufb_m256(dup_mask[6], lo), \ + pshufb_m256(dup_mask[7], hi)), \ + 3), SHIFT_OR_M3) + +static really_inline +m256 prep_conf_teddy_no_reinforcement_m1(const m256 *lo_mask, + const m256 *dup_mask, + const m256 val) { + PREP_SHUF_MASK_NO_REINFORCEMENT(val); + return SHIFT_OR_M1; +} + +static really_inline +m256 prep_conf_teddy_no_reinforcement_m2(const m256 *lo_mask, + const m256 *dup_mask, + const m256 val) { + PREP_SHUF_MASK_NO_REINFORCEMENT(val); + return SHIFT_OR_M2; +} + +static really_inline +m256 prep_conf_teddy_no_reinforcement_m3(const m256 *lo_mask, + const m256 *dup_mask, + const m256 val) { + PREP_SHUF_MASK_NO_REINFORCEMENT(val); + return SHIFT_OR_M3; +} + +static really_inline +m256 prep_conf_teddy_no_reinforcement_m4(const m256 *lo_mask, + const m256 *dup_mask, + const m256 val) { + PREP_SHUF_MASK_NO_REINFORCEMENT(val); + return SHIFT_OR_M4; +} + +static really_inline +m256 prep_conf_teddy_m1(const m256 *lo_mask, const m256 *dup_mask, + const u8 *ptr, const u64a *r_msk_base, + u32 *c_0, u32 *c_128) { + PREP_SHUF_MASK; + return or256(SHIFT_OR_M1, r_msk); +} + +static really_inline +m256 prep_conf_teddy_m2(const m256 *lo_mask, const m256 *dup_mask, + const u8 *ptr, const u64a *r_msk_base, + u32 *c_0, u32 *c_128) { + PREP_SHUF_MASK; + return or256(SHIFT_OR_M2, r_msk); +} + +static really_inline +m256 prep_conf_teddy_m3(const m256 *lo_mask, const m256 *dup_mask, + const u8 *ptr, const u64a *r_msk_base, + u32 *c_0, u32 *c_128) { + PREP_SHUF_MASK; + return or256(SHIFT_OR_M3, r_msk); +} + +static really_inline +m256 prep_conf_teddy_m4(const m256 *lo_mask, const m256 *dup_mask, + const u8 *ptr, const u64a *r_msk_base, + u32 *c_0, u32 *c_128) { + PREP_SHUF_MASK; + return or256(SHIFT_OR_M4, r_msk); +} + +#define PREP_CONF_FN_NO_REINFORCEMENT(val, n) \ + prep_conf_teddy_no_reinforcement_m##n(&lo_mask, dup_mask, val) + +#define PREP_CONF_FN(ptr, n) \ + prep_conf_teddy_m##n(&lo_mask, dup_mask, ptr, r_msk_base, &c_0, &c_128) + +#define PREPARE_MASKS_1 \ + dup_mask[0] = set2x128(maskBase[0]); \ + dup_mask[1] = set2x128(maskBase[1]); + +#define PREPARE_MASKS_2 \ + PREPARE_MASKS_1 \ + dup_mask[2] = set2x128(maskBase[2]); \ + dup_mask[3] = set2x128(maskBase[3]); + +#define PREPARE_MASKS_3 \ + PREPARE_MASKS_2 \ + dup_mask[4] = set2x128(maskBase[4]); \ + dup_mask[5] = set2x128(maskBase[5]); + +#define PREPARE_MASKS_4 \ + PREPARE_MASKS_3 \ + dup_mask[6] = set2x128(maskBase[6]); \ + dup_mask[7] = set2x128(maskBase[7]); + +#define PREPARE_MASKS(n) \ + m256 lo_mask = set32x8(0xf); \ + m256 dup_mask[n * 2]; \ + PREPARE_MASKS_##n + +#define FDR_EXEC_TEDDY(fdr, a, control, n_msk, conf_fn) \ +do { \ + const u8 *buf_end = a->buf + a->len; \ + const u8 *ptr = a->buf + a->start_offset; \ + u32 floodBackoff = FLOOD_BACKOFF_START; \ + const u8 *tryFloodDetect = a->firstFloodDetect; \ + u32 last_match = 
ones_u32; \ + const struct Teddy *teddy = (const struct Teddy *)fdr; \ + const size_t iterBytes = 64; \ + DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n", \ + a->buf, a->len, a->start_offset); \ + \ + const m128 *maskBase = getMaskBase(teddy); \ + PREPARE_MASKS(n_msk); \ + const u32 *confBase = getConfBase(teddy); \ + \ + const u64a *r_msk_base = getReinforcedMaskBase(teddy, n_msk); \ + u32 c_0 = 0x100; \ + u32 c_128 = 0x100; \ + const u8 *mainStart = ROUNDUP_PTR(ptr, 32); \ + DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart); \ + if (ptr < mainStart) { \ + ptr = mainStart - 32; \ + m256 p_mask; \ + m256 val_0 = vectoredLoad256(&p_mask, ptr, a->start_offset, \ + a->buf, buf_end, \ + a->buf_history, a->len_history, n_msk); \ + m256 r_0 = PREP_CONF_FN_NO_REINFORCEMENT(val_0, n_msk); \ + r_0 = or256(r_0, p_mask); \ + CONFIRM_TEDDY(r_0, 8, 0, VECTORING, conf_fn); \ + ptr += 32; \ + } \ + \ + if (ptr + 32 <= buf_end) { \ + m256 r_0 = PREP_CONF_FN(ptr, n_msk); \ + CONFIRM_TEDDY(r_0, 8, 0, VECTORING, conf_fn); \ + ptr += 32; \ + } \ + \ + for (; ptr + iterBytes <= buf_end; ptr += iterBytes) { \ + __builtin_prefetch(ptr + (iterBytes * 4)); \ + CHECK_FLOOD; \ + m256 r_0 = PREP_CONF_FN(ptr, n_msk); \ + CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, conf_fn); \ + m256 r_1 = PREP_CONF_FN(ptr + 32, n_msk); \ + CONFIRM_TEDDY(r_1, 8, 32, NOT_CAUTIOUS, conf_fn); \ + } \ + \ + if (ptr + 32 <= buf_end) { \ + m256 r_0 = PREP_CONF_FN(ptr, n_msk); \ + CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, conf_fn); \ + ptr += 32; \ + } \ + \ + assert(ptr + 32 > buf_end); \ + if (ptr < buf_end) { \ + m256 p_mask; \ + m256 val_0 = vectoredLoad256(&p_mask, ptr, 0, ptr, buf_end, \ + a->buf_history, a->len_history, n_msk); \ + m256 r_0 = PREP_CONF_FN_NO_REINFORCEMENT(val_0, n_msk); \ + r_0 = or256(r_0, p_mask); \ + CONFIRM_TEDDY(r_0, 8, 0, VECTORING, conf_fn); \ + } \ + \ + return HWLM_SUCCESS; \ +} while(0) + +#else // not defined HAVE_AVX2 + +#ifdef ARCH_64_BIT +#define CONFIRM_TEDDY(var, bucket, offset, reason, conf_fn) \ +do { \ + if (unlikely(diff128(var, ones128()))) { \ + u64a lo = movq(var); \ + u64a hi = movq(rshiftbyte_m128(var, 8)); \ + CONF_CHUNK_64(lo, bucket, offset, reason, conf_fn); \ + CONF_CHUNK_64(hi, bucket, offset + 8, reason, conf_fn); \ + } \ +} while(0) +#else +#define CONFIRM_TEDDY(var, bucket, offset, reason, conf_fn) \ +do { \ + if (unlikely(diff128(var, ones128()))) { \ + u32 part1 = movd(var); \ + u32 part2 = movd(rshiftbyte_m128(var, 4)); \ + u32 part3 = movd(rshiftbyte_m128(var, 8)); \ + u32 part4 = movd(rshiftbyte_m128(var, 12)); \ + CONF_CHUNK_32(part1, bucket, offset, reason, conf_fn); \ + CONF_CHUNK_32(part2, bucket, offset + 4, reason, conf_fn); \ + CONF_CHUNK_32(part3, bucket, offset + 8, reason, conf_fn); \ + CONF_CHUNK_32(part4, bucket, offset + 12, reason, conf_fn); \ + } \ +} while(0) +#endif + +static really_inline +m128 prep_conf_teddy_m1(const m128 *maskBase, m128 val) { + m128 mask = set16x8(0xf); + m128 lo = and128(val, mask); + m128 hi = and128(rshift64_m128(val, 4), mask); + return or128(pshufb_m128(maskBase[0 * 2], lo), + pshufb_m128(maskBase[0 * 2 + 1], hi)); +} + +static really_inline +m128 prep_conf_teddy_m2(const m128 *maskBase, m128 *old_1, m128 val) { + m128 mask = set16x8(0xf); + m128 lo = and128(val, mask); + m128 hi = and128(rshift64_m128(val, 4), mask); + m128 r = prep_conf_teddy_m1(maskBase, val); + + m128 res_1 = or128(pshufb_m128(maskBase[1 * 2], lo), + pshufb_m128(maskBase[1 * 2 + 1], hi)); + m128 res_shifted_1 = palignr(res_1, *old_1, 16 - 1); + *old_1 = 
res_1; + return or128(r, res_shifted_1); +} + +static really_inline +m128 prep_conf_teddy_m3(const m128 *maskBase, m128 *old_1, m128 *old_2, + m128 val) { + m128 mask = set16x8(0xf); + m128 lo = and128(val, mask); + m128 hi = and128(rshift64_m128(val, 4), mask); + m128 r = prep_conf_teddy_m2(maskBase, old_1, val); + + m128 res_2 = or128(pshufb_m128(maskBase[2 * 2], lo), + pshufb_m128(maskBase[2 * 2 + 1], hi)); + m128 res_shifted_2 = palignr(res_2, *old_2, 16 - 2); + *old_2 = res_2; + return or128(r, res_shifted_2); +} + +static really_inline +m128 prep_conf_teddy_m4(const m128 *maskBase, m128 *old_1, m128 *old_2, + m128 *old_3, m128 val) { + m128 mask = set16x8(0xf); + m128 lo = and128(val, mask); + m128 hi = and128(rshift64_m128(val, 4), mask); + m128 r = prep_conf_teddy_m3(maskBase, old_1, old_2, val); + + m128 res_3 = or128(pshufb_m128(maskBase[3 * 2], lo), + pshufb_m128(maskBase[3 * 2 + 1], hi)); + m128 res_shifted_3 = palignr(res_3, *old_3, 16 - 3); + *old_3 = res_3; + return or128(r, res_shifted_3); +} + +#define FDR_EXEC_TEDDY_RES_OLD_1 + +#define FDR_EXEC_TEDDY_RES_OLD_2 \ + m128 res_old_1 = zeroes128(); + +#define FDR_EXEC_TEDDY_RES_OLD_3 \ + m128 res_old_1 = zeroes128(); \ + m128 res_old_2 = zeroes128(); + +#define FDR_EXEC_TEDDY_RES_OLD_4 \ + m128 res_old_1 = zeroes128(); \ + m128 res_old_2 = zeroes128(); \ + m128 res_old_3 = zeroes128(); + +#define FDR_EXEC_TEDDY_RES_OLD(n) FDR_EXEC_TEDDY_RES_OLD_##n + +#define PREP_CONF_FN_1(mask_base, val) \ + prep_conf_teddy_m1(mask_base, val) + +#define PREP_CONF_FN_2(mask_base, val) \ + prep_conf_teddy_m2(mask_base, &res_old_1, val) + +#define PREP_CONF_FN_3(mask_base, val) \ + prep_conf_teddy_m3(mask_base, &res_old_1, &res_old_2, val) + +#define PREP_CONF_FN_4(mask_base, val) \ + prep_conf_teddy_m4(mask_base, &res_old_1, &res_old_2, &res_old_3, val) + +#define PREP_CONF_FN(mask_base, val, n) \ + PREP_CONF_FN_##n(mask_base, val) + +#define FDR_EXEC_TEDDY(fdr, a, control, n_msk, conf_fn) \ +do { \ + const u8 *buf_end = a->buf + a->len; \ + const u8 *ptr = a->buf + a->start_offset; \ + u32 floodBackoff = FLOOD_BACKOFF_START; \ + const u8 *tryFloodDetect = a->firstFloodDetect; \ + u32 last_match = ones_u32; \ + const struct Teddy *teddy = (const struct Teddy *)fdr; \ + const size_t iterBytes = 32; \ + DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n", \ + a->buf, a->len, a->start_offset); \ + \ + const m128 *maskBase = getMaskBase(teddy); \ + const u32 *confBase = getConfBase(teddy); \ + \ + FDR_EXEC_TEDDY_RES_OLD(n_msk); \ + const u8 *mainStart = ROUNDUP_PTR(ptr, 16); \ + DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart); \ + if (ptr < mainStart) { \ + ptr = mainStart - 16; \ + m128 p_mask; \ + m128 val_0 = vectoredLoad128(&p_mask, ptr, a->start_offset, \ + a->buf, buf_end, \ + a->buf_history, a->len_history, n_msk); \ + m128 r_0 = PREP_CONF_FN(maskBase, val_0, n_msk); \ + r_0 = or128(r_0, p_mask); \ + CONFIRM_TEDDY(r_0, 8, 0, VECTORING, conf_fn); \ + ptr += 16; \ + } \ + \ + if (ptr + 16 <= buf_end) { \ + m128 r_0 = PREP_CONF_FN(maskBase, load128(ptr), n_msk); \ + CONFIRM_TEDDY(r_0, 8, 0, VECTORING, conf_fn); \ + ptr += 16; \ + } \ + \ + for (; ptr + iterBytes <= buf_end; ptr += iterBytes) { \ + __builtin_prefetch(ptr + (iterBytes * 4)); \ + CHECK_FLOOD; \ + m128 r_0 = PREP_CONF_FN(maskBase, load128(ptr), n_msk); \ + CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, conf_fn); \ + m128 r_1 = PREP_CONF_FN(maskBase, load128(ptr + 16), n_msk); \ + CONFIRM_TEDDY(r_1, 8, 16, NOT_CAUTIOUS, conf_fn); \ + } \ + \ + if (ptr + 16 <= buf_end) 
{ \ + m128 r_0 = PREP_CONF_FN(maskBase, load128(ptr), n_msk); \ + CONFIRM_TEDDY(r_0, 8, 0, NOT_CAUTIOUS, conf_fn); \ + ptr += 16; \ + } \ + \ + assert(ptr + 16 > buf_end); \ + if (ptr < buf_end) { \ + m128 p_mask; \ + m128 val_0 = vectoredLoad128(&p_mask, ptr, 0, ptr, buf_end, \ + a->buf_history, a->len_history, n_msk); \ + m128 r_0 = PREP_CONF_FN(maskBase, val_0, n_msk); \ + r_0 = or128(r_0, p_mask); \ + CONFIRM_TEDDY(r_0, 8, 0, VECTORING, conf_fn); \ + } \ + \ + return HWLM_SUCCESS; \ +} while(0) + +#endif // HAVE_AVX2 HAVE_AVX512 + +hwlm_error_t fdr_exec_teddy_msks1(const struct FDR *fdr, + const struct FDR_Runtime_Args *a, + hwlm_group_t control) { + FDR_EXEC_TEDDY(fdr, a, control, 1, do_confWithBit_teddy); +} + +hwlm_error_t fdr_exec_teddy_msks1_pck(const struct FDR *fdr, + const struct FDR_Runtime_Args *a, + hwlm_group_t control) { + FDR_EXEC_TEDDY(fdr, a, control, 1, do_confWithBit_teddy); +} + +hwlm_error_t fdr_exec_teddy_msks2(const struct FDR *fdr, + const struct FDR_Runtime_Args *a, + hwlm_group_t control) { + FDR_EXEC_TEDDY(fdr, a, control, 2, do_confWithBit_teddy); +} + +hwlm_error_t fdr_exec_teddy_msks2_pck(const struct FDR *fdr, + const struct FDR_Runtime_Args *a, + hwlm_group_t control) { + FDR_EXEC_TEDDY(fdr, a, control, 2, do_confWithBit_teddy); +} + +hwlm_error_t fdr_exec_teddy_msks3(const struct FDR *fdr, + const struct FDR_Runtime_Args *a, + hwlm_group_t control) { + FDR_EXEC_TEDDY(fdr, a, control, 3, do_confWithBit_teddy); +} + +hwlm_error_t fdr_exec_teddy_msks3_pck(const struct FDR *fdr, + const struct FDR_Runtime_Args *a, + hwlm_group_t control) { + FDR_EXEC_TEDDY(fdr, a, control, 3, do_confWithBit_teddy); +} + +hwlm_error_t fdr_exec_teddy_msks4(const struct FDR *fdr, + const struct FDR_Runtime_Args *a, + hwlm_group_t control) { + FDR_EXEC_TEDDY(fdr, a, control, 4, do_confWithBit_teddy); +} + +hwlm_error_t fdr_exec_teddy_msks4_pck(const struct FDR *fdr, + const struct FDR_Runtime_Args *a, + hwlm_group_t control) { + FDR_EXEC_TEDDY(fdr, a, control, 4, do_confWithBit_teddy); +} diff --git a/regex/fdr/teddy.h b/regex/fdr/teddy.h new file mode 100644 index 000000000..40ae07562 --- /dev/null +++ b/regex/fdr/teddy.h @@ -0,0 +1,110 @@ +/* + * Copyright (c) 2016-2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Teddy literal matcher: function declarations. + */ + +#ifndef TEDDY_H_ +#define TEDDY_H_ + +#include "hwlm/hwlm.h" // for hwlm_group_t +#include "util/arch.h" + +struct FDR; // forward declaration from fdr_internal.h +struct FDR_Runtime_Args; + +hwlm_error_t fdr_exec_teddy_msks1(const struct FDR *fdr, + const struct FDR_Runtime_Args *a, + hwlm_group_t control); + +hwlm_error_t fdr_exec_teddy_msks1_pck(const struct FDR *fdr, + const struct FDR_Runtime_Args *a, + hwlm_group_t control); + +hwlm_error_t fdr_exec_teddy_msks2(const struct FDR *fdr, + const struct FDR_Runtime_Args *a, + hwlm_group_t control); + +hwlm_error_t fdr_exec_teddy_msks2_pck(const struct FDR *fdr, + const struct FDR_Runtime_Args *a, + hwlm_group_t control); + +hwlm_error_t fdr_exec_teddy_msks3(const struct FDR *fdr, + const struct FDR_Runtime_Args *a, + hwlm_group_t control); + +hwlm_error_t fdr_exec_teddy_msks3_pck(const struct FDR *fdr, + const struct FDR_Runtime_Args *a, + hwlm_group_t control); + +hwlm_error_t fdr_exec_teddy_msks4(const struct FDR *fdr, + const struct FDR_Runtime_Args *a, + hwlm_group_t control); + +hwlm_error_t fdr_exec_teddy_msks4_pck(const struct FDR *fdr, + const struct FDR_Runtime_Args *a, + hwlm_group_t control); + +#if defined(HAVE_AVX2) + +hwlm_error_t fdr_exec_fat_teddy_msks1(const struct FDR *fdr, + const struct FDR_Runtime_Args *a, + hwlm_group_t control); + +hwlm_error_t fdr_exec_fat_teddy_msks1_pck(const struct FDR *fdr, + const struct FDR_Runtime_Args *a, + hwlm_group_t control); + +hwlm_error_t fdr_exec_fat_teddy_msks2(const struct FDR *fdr, + const struct FDR_Runtime_Args *a, + hwlm_group_t control); + +hwlm_error_t fdr_exec_fat_teddy_msks2_pck(const struct FDR *fdr, + const struct FDR_Runtime_Args *a, + hwlm_group_t control); + +hwlm_error_t fdr_exec_fat_teddy_msks3(const struct FDR *fdr, + const struct FDR_Runtime_Args *a, + hwlm_group_t control); + +hwlm_error_t fdr_exec_fat_teddy_msks3_pck(const struct FDR *fdr, + const struct FDR_Runtime_Args *a, + hwlm_group_t control); + +hwlm_error_t fdr_exec_fat_teddy_msks4(const struct FDR *fdr, + const struct FDR_Runtime_Args *a, + hwlm_group_t control); + +hwlm_error_t fdr_exec_fat_teddy_msks4_pck(const struct FDR *fdr, + const struct FDR_Runtime_Args *a, + hwlm_group_t control); + +#endif /* HAVE_AVX2 */ + +#endif /* TEDDY_H_ */ diff --git a/regex/fdr/teddy_avx2.c b/regex/fdr/teddy_avx2.c new file mode 100644 index 000000000..6a6b27a5f --- /dev/null +++ b/regex/fdr/teddy_avx2.c @@ -0,0 +1,709 @@ +/* + * Copyright (c) 2016-2020, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Teddy literal matcher: AVX2 engine runtime. + */ + +#include "fdr_internal.h" +#include "flood_runtime.h" +#include "teddy.h" +#include "teddy_internal.h" +#include "teddy_runtime_common.h" +#include "util/arch.h" +#include "util/simd_utils.h" + +#if defined(HAVE_AVX2) + +const u8 ALIGN_AVX_DIRECTIVE p_mask_arr256[33][64] = { + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff}, + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00} +}; + +#if defined(HAVE_AVX512VBMI) // VBMI strong fat teddy + +#define CONF_FAT_CHUNK_64(chunk, bucket, off, reason, pt, conf_fn) \ +do { \ + if (unlikely(chunk != ones_u64a)) { \ + chunk = ~chunk; \ + conf_fn(&chunk, bucket, off, confBase, reason, a, pt, \ + &control, &last_match); \ + CHECK_HWLM_TERMINATE_MATCHING; \ + } \ +} while(0) + +#define CONF_FAT_CHUNK_32(chunk, bucket, off, reason, pt, conf_fn) \ +do { \ + if (unlikely(chunk != ones_u32)) { \ + chunk = ~chunk; \ + conf_fn(&chunk, bucket, off, confBase, reason, a, pt, \ + &control, &last_match); \ + CHECK_HWLM_TERMINATE_MATCHING; \ + } \ +} while(0) + +static really_inline +const m512 *getDupMaskBase(const struct Teddy *teddy, u8 numMask) { + return (const m512 *)((const u8 *)teddy + ROUNDUP_CL(sizeof(struct Teddy)) + + ROUNDUP_CL(2 * numMask * sizeof(m256))); +} + +#else + +#define CONF_FAT_CHUNK_64(chunk, bucket, off, reason, conf_fn) \ +do { \ + if (unlikely(chunk != ones_u64a)) { \ + chunk = ~chunk; \ + conf_fn(&chunk, bucket, off, confBase, reason, a, ptr, \ + &control, &last_match); \ + CHECK_HWLM_TERMINATE_MATCHING; \ + } \ +} while(0) + +#define CONF_FAT_CHUNK_32(chunk, bucket, off, reason, conf_fn) \ +do { \ + if (unlikely(chunk != ones_u32)) { \ + chunk = ~chunk; \ + conf_fn(&chunk, bucket, off, confBase, reason, a, ptr, \ + &control, &last_match); \ + CHECK_HWLM_TERMINATE_MATCHING; \ + } \ +} while(0) + +static really_inline +const m256 *getMaskBase_fat(const struct Teddy *teddy) { + return (const m256 *)((const u8 *)teddy + ROUNDUP_CL(sizeof(struct Teddy))); +} + +#endif + +#if defined(HAVE_AVX512VBMI) // VBMI strong fat teddy + +const u8 ALIGN_AVX_DIRECTIVE p_mask_interleave[64] = { + 0, 32, 1, 33, 2, 34, 3, 35, 4, 36, 5, 37, 6, 38, 7, 39, + 8, 40, 9, 41, 10, 42, 11, 43, 12, 44, 13, 45, 14, 46, 15, 47, + 16, 48, 17, 49, 18, 50, 19, 51, 20, 52, 21, 53, 22, 54, 23, 55, + 24, 56, 25, 57, 26, 58, 27, 59, 28, 60, 29, 61, 30, 62, 31, 63 +}; + +#ifdef ARCH_64_BIT +#define CONFIRM_FAT_TEDDY(var, bucket, offset, reason, pt, conf_fn) \ +do { \ + if (unlikely(diff512(var, ones512()))) { \ + m512 msk_interleave = load512(p_mask_interleave); \ + m512 r = vpermb512(msk_interleave, var); \ + m128 r0 = extract128from512(r, 0); \ + m128 r1 = extract128from512(r, 1); \ + m128 r2 = extract128from512(r, 2); \ + m128 r3 = extract128from512(r, 3); \ + u64a part1 = movq(r0); \ + u64a part2 = extract64from128(r0, 1); \ + u64a part3 = movq(r1); \ + u64a part4 = extract64from128(r1, 1); \ + u64a part5 = movq(r2); \ + u64a part6 = extract64from128(r2, 1); \ + u64a part7 = movq(r3); \ + u64a part8 = extract64from128(r3, 1); \ + CONF_FAT_CHUNK_64(part1, bucket, offset, reason, pt, conf_fn); \ + CONF_FAT_CHUNK_64(part2, bucket, offset + 4, reason, pt, conf_fn); \ + CONF_FAT_CHUNK_64(part3, bucket, offset + 8, reason, pt, conf_fn); \ + CONF_FAT_CHUNK_64(part4, bucket, offset + 12, reason, pt, conf_fn); \ + CONF_FAT_CHUNK_64(part5, bucket, offset + 16, reason, pt, conf_fn); \ + CONF_FAT_CHUNK_64(part6, bucket, offset + 20, reason, pt, conf_fn); \ + CONF_FAT_CHUNK_64(part7, bucket, offset + 24, reason, pt, conf_fn); \ + CONF_FAT_CHUNK_64(part8, bucket, offset + 28, reason, pt, conf_fn); \ + } \ +} while(0) +#else +#define CONFIRM_FAT_TEDDY(var, bucket, offset, reason, pt, conf_fn) \ +do { \ + if 
(unlikely(diff512(var, ones512()))) { \ + m512 msk_interleave = load512(p_mask_interleave); \ + m512 r = vpermb512(msk_interleave, var); \ + m128 r0 = extract128from512(r, 0); \ + m128 r1 = extract128from512(r, 1); \ + m128 r2 = extract128from512(r, 2); \ + m128 r3 = extract128from512(r, 3); \ + u32 part1 = movd(r0); \ + u32 part2 = extract32from128(r0, 1); \ + u32 part3 = extract32from128(r0, 2); \ + u32 part4 = extract32from128(r0, 3); \ + u32 part5 = movd(r1); \ + u32 part6 = extract32from128(r1, 1); \ + u32 part7 = extract32from128(r1, 2); \ + u32 part8 = extract32from128(r1, 3); \ + u32 part9 = movd(r2); \ + u32 part10 = extract32from128(r2, 1); \ + u32 part11 = extract32from128(r2, 2); \ + u32 part12 = extract32from128(r2, 3); \ + u32 part13 = movd(r3); \ + u32 part14 = extract32from128(r3, 1); \ + u32 part15 = extract32from128(r3, 2); \ + u32 part16 = extract32from128(r3, 3); \ + CONF_FAT_CHUNK_32(part1, bucket, offset, reason, pt, conf_fn); \ + CONF_FAT_CHUNK_32(part2, bucket, offset + 2, reason, pt, conf_fn); \ + CONF_FAT_CHUNK_32(part3, bucket, offset + 4, reason, pt, conf_fn); \ + CONF_FAT_CHUNK_32(part4, bucket, offset + 6, reason, pt, conf_fn); \ + CONF_FAT_CHUNK_32(part5, bucket, offset + 8, reason, pt, conf_fn); \ + CONF_FAT_CHUNK_32(part6, bucket, offset + 10, reason, pt, conf_fn); \ + CONF_FAT_CHUNK_32(part7, bucket, offset + 12, reason, pt, conf_fn); \ + CONF_FAT_CHUNK_32(part8, bucket, offset + 14, reason, pt, conf_fn); \ + CONF_FAT_CHUNK_32(part9, bucket, offset + 16, reason, pt, conf_fn); \ + CONF_FAT_CHUNK_32(part10, bucket, offset + 18, reason, pt, conf_fn);\ + CONF_FAT_CHUNK_32(part11, bucket, offset + 20, reason, pt, conf_fn);\ + CONF_FAT_CHUNK_32(part12, bucket, offset + 22, reason, pt, conf_fn);\ + CONF_FAT_CHUNK_32(part13, bucket, offset + 24, reason, pt, conf_fn);\ + CONF_FAT_CHUNK_32(part14, bucket, offset + 26, reason, pt, conf_fn);\ + CONF_FAT_CHUNK_32(part15, bucket, offset + 28, reason, pt, conf_fn);\ + CONF_FAT_CHUNK_32(part16, bucket, offset + 30, reason, pt, conf_fn);\ + } \ +} while(0) +#endif + +#define PREP_FAT_SHUF_MASK \ + m512 lo = and512(val, *lo_mask); \ + m512 hi = and512(rshift64_m512(val, 4), *lo_mask) + +#define FAT_TEDDY_VBMI_PSHUFB_OR_M1 \ + m512 shuf_or_b0 = or512(pshufb_m512(dup_mask[0], lo), \ + pshufb_m512(dup_mask[1], hi)); + +#define FAT_TEDDY_VBMI_PSHUFB_OR_M2 \ + FAT_TEDDY_VBMI_PSHUFB_OR_M1 \ + m512 shuf_or_b1 = or512(pshufb_m512(dup_mask[2], lo), \ + pshufb_m512(dup_mask[3], hi)); + +#define FAT_TEDDY_VBMI_PSHUFB_OR_M3 \ + FAT_TEDDY_VBMI_PSHUFB_OR_M2 \ + m512 shuf_or_b2 = or512(pshufb_m512(dup_mask[4], lo), \ + pshufb_m512(dup_mask[5], hi)); + +#define FAT_TEDDY_VBMI_PSHUFB_OR_M4 \ + FAT_TEDDY_VBMI_PSHUFB_OR_M3 \ + m512 shuf_or_b3 = or512(pshufb_m512(dup_mask[6], lo), \ + pshufb_m512(dup_mask[7], hi)); + +#define FAT_TEDDY_VBMI_SL1_MASK 0xfffffffefffffffeULL +#define FAT_TEDDY_VBMI_SL2_MASK 0xfffffffcfffffffcULL +#define FAT_TEDDY_VBMI_SL3_MASK 0xfffffff8fffffff8ULL + +#define FAT_TEDDY_VBMI_SHIFT_M1 + +#define FAT_TEDDY_VBMI_SHIFT_M2 \ + FAT_TEDDY_VBMI_SHIFT_M1 \ + m512 sl1 = maskz_vpermb512(FAT_TEDDY_VBMI_SL1_MASK, sl_msk[0], shuf_or_b1); + +#define FAT_TEDDY_VBMI_SHIFT_M3 \ + FAT_TEDDY_VBMI_SHIFT_M2 \ + m512 sl2 = maskz_vpermb512(FAT_TEDDY_VBMI_SL2_MASK, sl_msk[1], shuf_or_b2); + +#define FAT_TEDDY_VBMI_SHIFT_M4 \ + FAT_TEDDY_VBMI_SHIFT_M3 \ + m512 sl3 = maskz_vpermb512(FAT_TEDDY_VBMI_SL3_MASK, sl_msk[2], shuf_or_b3); + +#define FAT_SHIFT_OR_M1 \ + shuf_or_b0 + +#define FAT_SHIFT_OR_M2 \ + or512(sl1, FAT_SHIFT_OR_M1) + +#define 
FAT_SHIFT_OR_M3 \ + or512(sl2, FAT_SHIFT_OR_M2) + +#define FAT_SHIFT_OR_M4 \ + or512(sl3, FAT_SHIFT_OR_M3) + +static really_inline +m512 prep_conf_fat_teddy_m1(const m512 *lo_mask, const m512 *dup_mask, + UNUSED const m512 *sl_msk, const m512 val) { + PREP_FAT_SHUF_MASK; + FAT_TEDDY_VBMI_PSHUFB_OR_M1; + FAT_TEDDY_VBMI_SHIFT_M1; + return FAT_SHIFT_OR_M1; +} + +static really_inline +m512 prep_conf_fat_teddy_m2(const m512 *lo_mask, const m512 *dup_mask, + const m512 *sl_msk, const m512 val) { + PREP_FAT_SHUF_MASK; + FAT_TEDDY_VBMI_PSHUFB_OR_M2; + FAT_TEDDY_VBMI_SHIFT_M2; + return FAT_SHIFT_OR_M2; +} + +static really_inline +m512 prep_conf_fat_teddy_m3(const m512 *lo_mask, const m512 *dup_mask, + const m512 *sl_msk, const m512 val) { + PREP_FAT_SHUF_MASK; + FAT_TEDDY_VBMI_PSHUFB_OR_M3; + FAT_TEDDY_VBMI_SHIFT_M3; + return FAT_SHIFT_OR_M3; +} + +static really_inline +m512 prep_conf_fat_teddy_m4(const m512 *lo_mask, const m512 *dup_mask, + const m512 *sl_msk, const m512 val) { + PREP_FAT_SHUF_MASK; + FAT_TEDDY_VBMI_PSHUFB_OR_M4; + FAT_TEDDY_VBMI_SHIFT_M4; + return FAT_SHIFT_OR_M4; +} + +#define PREP_CONF_FAT_FN(val, n) \ + prep_conf_fat_teddy_m##n(&lo_mask, dup_mask, sl_msk, val) + +#define FAT_TEDDY_VBMI_SL1_POS 15 +#define FAT_TEDDY_VBMI_SL2_POS 14 +#define FAT_TEDDY_VBMI_SL3_POS 13 + +#define FAT_TEDDY_VBMI_LOAD_SHIFT_MASK_M1 + +#define FAT_TEDDY_VBMI_LOAD_SHIFT_MASK_M2 \ + FAT_TEDDY_VBMI_LOAD_SHIFT_MASK_M1 \ + sl_msk[0] = loadu512(p_sh_mask_arr + FAT_TEDDY_VBMI_SL1_POS); + +#define FAT_TEDDY_VBMI_LOAD_SHIFT_MASK_M3 \ + FAT_TEDDY_VBMI_LOAD_SHIFT_MASK_M2 \ + sl_msk[1] = loadu512(p_sh_mask_arr + FAT_TEDDY_VBMI_SL2_POS); + +#define FAT_TEDDY_VBMI_LOAD_SHIFT_MASK_M4 \ + FAT_TEDDY_VBMI_LOAD_SHIFT_MASK_M3 \ + sl_msk[2] = loadu512(p_sh_mask_arr + FAT_TEDDY_VBMI_SL3_POS); + +/* + * In FAT teddy, it needs 2 bytes to represent result of each position, + * so each nibble's(for example, lo nibble of last byte) FAT teddy mask + * has 16x2 bytes: + * |----------------------------------|----------------------------------| + * 16bytes (bucket 0..7 in each byte) 16bytes (bucket 8..15 in each byte) + * A B + * at runtime FAT teddy reads 16 bytes once and duplicate them to 32 bytes: + * |----------------------------------|----------------------------------| + * 16bytes input data (lo nibbles) 16bytes duplicated data (lo nibbles) + * X X + * then do pshufb_m256(AB, XX). + * + * In AVX512 reinforced FAT teddy, it reads 32 bytes once and duplicate them + * to 64 bytes: + * |----------------|----------------|----------------|----------------| + * X Y X Y + * in this case we need DUP_FAT_MASK to construct AABB: + * |----------------|----------------|----------------|----------------| + * A A B B + * then do pshufb_m512(AABB, XYXY). 
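To make the nibble lookup described above concrete, here is a minimal scalar sketch of the plain single-mask, 8-buckets-per-byte case (not the 16-bucket FAT layout): each input byte is split into its low and high nibble, both nibbles index a 16-entry bucket-mask table, and the two lookups are OR'ed — the same operation the pshufb_m256/pshufb_m512 pairs perform 32 or 64 positions at a time. The table contents and sample pattern are purely illustrative.

#include <stdint.h>
#include <stdio.h>

int main(void) {
    // lo_tbl[n] / hi_tbl[n]: bit b is cleared iff nibble n is permitted for
    // bucket b.  Here bucket 0 accepts only the byte 'a' (0x61).
    uint8_t lo_tbl[16], hi_tbl[16];
    for (int i = 0; i < 16; i++) {
        lo_tbl[i] = 0xff;
        hi_tbl[i] = 0xff;
    }
    lo_tbl[0x1] &= (uint8_t)~1u;   // low nibble of 'a'
    hi_tbl[0x6] &= (uint8_t)~1u;   // high nibble of 'a'

    const char *buf = "xxaxx";
    for (size_t i = 0; buf[i]; i++) {
        uint8_t c = (uint8_t)buf[i];
        // OR of the two lookups mirrors pshufb(lo_mask, lo) | pshufb(hi_mask, hi);
        // a cleared bit means "bucket still possible at this position".
        uint8_t cand = lo_tbl[c & 0xf] | hi_tbl[c >> 4];
        if (cand != 0xff)
            printf("bucket candidate at offset %zu (byte '%c')\n", i, c);
    }
    return 0;
}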
+ */ + +#define PREPARE_FAT_MASKS(n) \ + m512 lo_mask = set64x8(0xf); \ + m512 sl_msk[n - 1]; \ + FAT_TEDDY_VBMI_LOAD_SHIFT_MASK_M##n + +#define FAT_TEDDY_VBMI_CONF_MASK_HEAD (0xffffffffULL >> n_sh) +#define FAT_TEDDY_VBMI_CONF_MASK_FULL ((0xffffffffULL << n_sh) & 0xffffffffULL) +#define FAT_TEDDY_VBMI_CONF_MASK_VAR(n) (0xffffffffULL >> (32 - n) << overlap) +#define FAT_TEDDY_VBMI_LOAD_MASK_PATCH (0xffffffffULL >> (32 - n_sh)) + +#define FDR_EXEC_FAT_TEDDY(fdr, a, control, n_msk, conf_fn) \ +do { \ + const u8 *buf_end = a->buf + a->len; \ + const u8 *ptr = a->buf + a->start_offset; \ + u32 floodBackoff = FLOOD_BACKOFF_START; \ + const u8 *tryFloodDetect = a->firstFloodDetect; \ + u32 last_match = ones_u32; \ + const struct Teddy *teddy = (const struct Teddy *)fdr; \ + const size_t iterBytes = 32; \ + u32 n_sh = n_msk - 1; \ + const size_t loopBytes = 32 - n_sh; \ + DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n", \ + a->buf, a->len, a->start_offset); \ + \ + const m512 *dup_mask = getDupMaskBase(teddy, n_msk); \ + PREPARE_FAT_MASKS(n_msk); \ + const u32 *confBase = getConfBase(teddy); \ + \ + u64a k = FAT_TEDDY_VBMI_CONF_MASK_FULL; \ + m512 p_mask = set_mask_m512(~((k << 32) | k)); \ + u32 overlap = 0; \ + u64a patch = 0; \ + if (likely(ptr + loopBytes <= buf_end)) { \ + u64a k0 = FAT_TEDDY_VBMI_CONF_MASK_HEAD; \ + m512 p_mask0 = set_mask_m512(~((k0 << 32) | k0)); \ + m512 r_0 = PREP_CONF_FAT_FN(set2x256(loadu256(ptr)), n_msk); \ + r_0 = or512(r_0, p_mask0); \ + CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, ptr, conf_fn); \ + ptr += loopBytes; \ + overlap = n_sh; \ + patch = FAT_TEDDY_VBMI_LOAD_MASK_PATCH; \ + } \ + \ + for (; ptr + loopBytes <= buf_end; ptr += loopBytes) { \ + CHECK_FLOOD; \ + m512 r_0 = PREP_CONF_FAT_FN(set2x256(loadu256(ptr - n_sh)), n_msk); \ + r_0 = or512(r_0, p_mask); \ + CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, ptr - n_sh, conf_fn); \ + } \ + \ + assert(ptr + loopBytes > buf_end); \ + if (ptr < buf_end) { \ + u32 left = (u32)(buf_end - ptr); \ + u64a k1 = FAT_TEDDY_VBMI_CONF_MASK_VAR(left); \ + m512 p_mask1 = set_mask_m512(~((k1 << 32) | k1)); \ + m512 val_0 = set2x256(loadu_maskz_m256(k1 | patch, ptr - overlap)); \ + m512 r_0 = PREP_CONF_FAT_FN(val_0, n_msk); \ + r_0 = or512(r_0, p_mask1); \ + CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, ptr - overlap, conf_fn); \ + } \ + \ + return HWLM_SUCCESS; \ +} while(0) + +#else // !HAVE_AVX512VBMI, AVX2 normal fat teddy + +#ifdef ARCH_64_BIT +#define CONFIRM_FAT_TEDDY(var, bucket, offset, reason, conf_fn) \ +do { \ + if (unlikely(diff256(var, ones256()))) { \ + m256 swap = swap128in256(var); \ + m256 r = interleave256lo(var, swap); \ + u64a part1 = extractlow64from256(r); \ + u64a part2 = extract64from256(r, 1); \ + r = interleave256hi(var, swap); \ + u64a part3 = extractlow64from256(r); \ + u64a part4 = extract64from256(r, 1); \ + CONF_FAT_CHUNK_64(part1, bucket, offset, reason, conf_fn); \ + CONF_FAT_CHUNK_64(part2, bucket, offset + 4, reason, conf_fn); \ + CONF_FAT_CHUNK_64(part3, bucket, offset + 8, reason, conf_fn); \ + CONF_FAT_CHUNK_64(part4, bucket, offset + 12, reason, conf_fn); \ + } \ +} while(0) +#else +#define CONFIRM_FAT_TEDDY(var, bucket, offset, reason, conf_fn) \ +do { \ + if (unlikely(diff256(var, ones256()))) { \ + m256 swap = swap128in256(var); \ + m256 r = interleave256lo(var, swap); \ + u32 part1 = extractlow32from256(r); \ + u32 part2 = extract32from256(r, 1); \ + u32 part3 = extract32from256(r, 2); \ + u32 part4 = extract32from256(r, 3); \ + r = interleave256hi(var, swap); \ + u32 part5 = 
extractlow32from256(r); \ + u32 part6 = extract32from256(r, 1); \ + u32 part7 = extract32from256(r, 2); \ + u32 part8 = extract32from256(r, 3); \ + CONF_FAT_CHUNK_32(part1, bucket, offset, reason, conf_fn); \ + CONF_FAT_CHUNK_32(part2, bucket, offset + 2, reason, conf_fn); \ + CONF_FAT_CHUNK_32(part3, bucket, offset + 4, reason, conf_fn); \ + CONF_FAT_CHUNK_32(part4, bucket, offset + 6, reason, conf_fn); \ + CONF_FAT_CHUNK_32(part5, bucket, offset + 8, reason, conf_fn); \ + CONF_FAT_CHUNK_32(part6, bucket, offset + 10, reason, conf_fn); \ + CONF_FAT_CHUNK_32(part7, bucket, offset + 12, reason, conf_fn); \ + CONF_FAT_CHUNK_32(part8, bucket, offset + 14, reason, conf_fn); \ + } \ +} while(0) +#endif + +static really_inline +m256 vectoredLoad2x128(m256 *p_mask, const u8 *ptr, const size_t start_offset, + const u8 *lo, const u8 *hi, + const u8 *buf_history, size_t len_history, + const u32 nMasks) { + m128 p_mask128; + m256 ret = set2x128(vectoredLoad128(&p_mask128, ptr, start_offset, lo, hi, + buf_history, len_history, nMasks)); + *p_mask = set2x128(p_mask128); + return ret; +} + +static really_inline +m256 prep_conf_fat_teddy_m1(const m256 *maskBase, m256 val) { + m256 mask = set32x8(0xf); + m256 lo = and256(val, mask); + m256 hi = and256(rshift64_m256(val, 4), mask); + return or256(pshufb_m256(maskBase[0 * 2], lo), + pshufb_m256(maskBase[0 * 2 + 1], hi)); +} + +static really_inline +m256 prep_conf_fat_teddy_m2(const m256 *maskBase, m256 *old_1, m256 val) { + m256 mask = set32x8(0xf); + m256 lo = and256(val, mask); + m256 hi = and256(rshift64_m256(val, 4), mask); + m256 r = prep_conf_fat_teddy_m1(maskBase, val); + + m256 res_1 = or256(pshufb_m256(maskBase[1 * 2], lo), + pshufb_m256(maskBase[1 * 2 + 1], hi)); + m256 res_shifted_1 = vpalignr(res_1, *old_1, 16 - 1); + *old_1 = res_1; + return or256(r, res_shifted_1); +} + +static really_inline +m256 prep_conf_fat_teddy_m3(const m256 *maskBase, m256 *old_1, m256 *old_2, + m256 val) { + m256 mask = set32x8(0xf); + m256 lo = and256(val, mask); + m256 hi = and256(rshift64_m256(val, 4), mask); + m256 r = prep_conf_fat_teddy_m2(maskBase, old_1, val); + + m256 res_2 = or256(pshufb_m256(maskBase[2 * 2], lo), + pshufb_m256(maskBase[2 * 2 + 1], hi)); + m256 res_shifted_2 = vpalignr(res_2, *old_2, 16 - 2); + *old_2 = res_2; + return or256(r, res_shifted_2); +} + +static really_inline +m256 prep_conf_fat_teddy_m4(const m256 *maskBase, m256 *old_1, m256 *old_2, + m256 *old_3, m256 val) { + m256 mask = set32x8(0xf); + m256 lo = and256(val, mask); + m256 hi = and256(rshift64_m256(val, 4), mask); + m256 r = prep_conf_fat_teddy_m3(maskBase, old_1, old_2, val); + + m256 res_3 = or256(pshufb_m256(maskBase[3 * 2], lo), + pshufb_m256(maskBase[3 * 2 + 1], hi)); + m256 res_shifted_3 = vpalignr(res_3, *old_3, 16 - 3); + *old_3 = res_3; + return or256(r, res_shifted_3); +} + +#define FDR_EXEC_FAT_TEDDY_RES_OLD_1 \ +do { \ +} while(0) + +#define FDR_EXEC_FAT_TEDDY_RES_OLD_2 \ + m256 res_old_1 = zeroes256(); + +#define FDR_EXEC_FAT_TEDDY_RES_OLD_3 \ + m256 res_old_1 = zeroes256(); \ + m256 res_old_2 = zeroes256(); + +#define FDR_EXEC_FAT_TEDDY_RES_OLD_4 \ + m256 res_old_1 = zeroes256(); \ + m256 res_old_2 = zeroes256(); \ + m256 res_old_3 = zeroes256(); + +#define FDR_EXEC_FAT_TEDDY_RES_OLD(n) FDR_EXEC_FAT_TEDDY_RES_OLD_##n + +#define PREP_CONF_FAT_FN_1(mask_base, val) \ + prep_conf_fat_teddy_m1(mask_base, val) + +#define PREP_CONF_FAT_FN_2(mask_base, val) \ + prep_conf_fat_teddy_m2(mask_base, &res_old_1, val) + +#define PREP_CONF_FAT_FN_3(mask_base, val) \ + 
prep_conf_fat_teddy_m3(mask_base, &res_old_1, &res_old_2, val) + +#define PREP_CONF_FAT_FN_4(mask_base, val) \ + prep_conf_fat_teddy_m4(mask_base, &res_old_1, &res_old_2, &res_old_3, val) + +#define PREP_CONF_FAT_FN(mask_base, val, n) \ + PREP_CONF_FAT_FN_##n(mask_base, val) + +#define FDR_EXEC_FAT_TEDDY(fdr, a, control, n_msk, conf_fn) \ +do { \ + const u8 *buf_end = a->buf + a->len; \ + const u8 *ptr = a->buf + a->start_offset; \ + u32 floodBackoff = FLOOD_BACKOFF_START; \ + const u8 *tryFloodDetect = a->firstFloodDetect; \ + u32 last_match = ones_u32; \ + const struct Teddy *teddy = (const struct Teddy *)fdr; \ + const size_t iterBytes = 32; \ + DEBUG_PRINTF("params: buf %p len %zu start_offset %zu\n", \ + a->buf, a->len, a->start_offset); \ + \ + const m256 *maskBase = getMaskBase_fat(teddy); \ + const u32 *confBase = getConfBase(teddy); \ + \ + FDR_EXEC_FAT_TEDDY_RES_OLD(n_msk); \ + const u8 *mainStart = ROUNDUP_PTR(ptr, 16); \ + DEBUG_PRINTF("derive: ptr: %p mainstart %p\n", ptr, mainStart); \ + if (ptr < mainStart) { \ + ptr = mainStart - 16; \ + m256 p_mask; \ + m256 val_0 = vectoredLoad2x128(&p_mask, ptr, a->start_offset, \ + a->buf, buf_end, \ + a->buf_history, a->len_history, \ + n_msk); \ + m256 r_0 = PREP_CONF_FAT_FN(maskBase, val_0, n_msk); \ + r_0 = or256(r_0, p_mask); \ + CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, conf_fn); \ + ptr += 16; \ + } \ + \ + if (ptr + 16 <= buf_end) { \ + m256 r_0 = PREP_CONF_FAT_FN(maskBase, load2x128(ptr), n_msk); \ + CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, conf_fn); \ + ptr += 16; \ + } \ + \ + for ( ; ptr + iterBytes <= buf_end; ptr += iterBytes) { \ + __builtin_prefetch(ptr + (iterBytes * 4)); \ + CHECK_FLOOD; \ + m256 r_0 = PREP_CONF_FAT_FN(maskBase, load2x128(ptr), n_msk); \ + CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, conf_fn); \ + m256 r_1 = PREP_CONF_FAT_FN(maskBase, load2x128(ptr + 16), n_msk); \ + CONFIRM_FAT_TEDDY(r_1, 16, 16, NOT_CAUTIOUS, conf_fn); \ + } \ + \ + if (ptr + 16 <= buf_end) { \ + m256 r_0 = PREP_CONF_FAT_FN(maskBase, load2x128(ptr), n_msk); \ + CONFIRM_FAT_TEDDY(r_0, 16, 0, NOT_CAUTIOUS, conf_fn); \ + ptr += 16; \ + } \ + \ + assert(ptr + 16 > buf_end); \ + if (ptr < buf_end) { \ + m256 p_mask; \ + m256 val_0 = vectoredLoad2x128(&p_mask, ptr, 0, ptr, buf_end, \ + a->buf_history, a->len_history, \ + n_msk); \ + m256 r_0 = PREP_CONF_FAT_FN(maskBase, val_0, n_msk); \ + r_0 = or256(r_0, p_mask); \ + CONFIRM_FAT_TEDDY(r_0, 16, 0, VECTORING, conf_fn); \ + } \ + \ + return HWLM_SUCCESS; \ +} while(0) + +#endif // HAVE_AVX512VBMI + +hwlm_error_t fdr_exec_fat_teddy_msks1(const struct FDR *fdr, + const struct FDR_Runtime_Args *a, + hwlm_group_t control) { + FDR_EXEC_FAT_TEDDY(fdr, a, control, 1, do_confWithBit_teddy); +} + +hwlm_error_t fdr_exec_fat_teddy_msks1_pck(const struct FDR *fdr, + const struct FDR_Runtime_Args *a, + hwlm_group_t control) { + FDR_EXEC_FAT_TEDDY(fdr, a, control, 1, do_confWithBit_teddy); +} + +hwlm_error_t fdr_exec_fat_teddy_msks2(const struct FDR *fdr, + const struct FDR_Runtime_Args *a, + hwlm_group_t control) { + FDR_EXEC_FAT_TEDDY(fdr, a, control, 2, do_confWithBit_teddy); +} + +hwlm_error_t fdr_exec_fat_teddy_msks2_pck(const struct FDR *fdr, + const struct FDR_Runtime_Args *a, + hwlm_group_t control) { + FDR_EXEC_FAT_TEDDY(fdr, a, control, 2, do_confWithBit_teddy); +} + +hwlm_error_t fdr_exec_fat_teddy_msks3(const struct FDR *fdr, + const struct FDR_Runtime_Args *a, + hwlm_group_t control) { + FDR_EXEC_FAT_TEDDY(fdr, a, control, 3, do_confWithBit_teddy); +} + +hwlm_error_t 
fdr_exec_fat_teddy_msks3_pck(const struct FDR *fdr, + const struct FDR_Runtime_Args *a, + hwlm_group_t control) { + FDR_EXEC_FAT_TEDDY(fdr, a, control, 3, do_confWithBit_teddy); +} + +hwlm_error_t fdr_exec_fat_teddy_msks4(const struct FDR *fdr, + const struct FDR_Runtime_Args *a, + hwlm_group_t control) { + FDR_EXEC_FAT_TEDDY(fdr, a, control, 4, do_confWithBit_teddy); +} + +hwlm_error_t fdr_exec_fat_teddy_msks4_pck(const struct FDR *fdr, + const struct FDR_Runtime_Args *a, + hwlm_group_t control) { + FDR_EXEC_FAT_TEDDY(fdr, a, control, 4, do_confWithBit_teddy); +} + +#endif // HAVE_AVX2 diff --git a/regex/fdr/teddy_internal.h b/regex/fdr/teddy_internal.h new file mode 100644 index 000000000..1e9e603fa --- /dev/null +++ b/regex/fdr/teddy_internal.h @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/* Teddy bytecode layout: + * * |-----| + * * | | struct Teddy + * * |-----| + * * | | teddy masks + * * | | + * * |-----| + * * | | reinforcement mask table for bucket 0..7 + * * | | + * * |-----| + * * | | reinforcement mask table for bucket 8..15 (FAT teddy) + * * | | + * * |-----| + * * | | confirm + * * | | + * * | | + * * |-----| + * * | | flood control + * * | | + * * |-----| + */ + +#ifndef TEDDY_INTERNAL_H +#define TEDDY_INTERNAL_H + +#include "ue2common.h" + +// first part is compatible with an FDR +struct Teddy { + u32 engineID; + u32 size; + u32 maxStringLen; + u32 numStrings; + u32 confOffset; + u32 floodOffset; +}; + +#endif diff --git a/regex/fdr/teddy_runtime_common.h b/regex/fdr/teddy_runtime_common.h new file mode 100644 index 000000000..b76800eb0 --- /dev/null +++ b/regex/fdr/teddy_runtime_common.h @@ -0,0 +1,459 @@ +/* + * Copyright (c) 2016-2020, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Teddy literal matcher: common runtime procedures. + */ + +#ifndef TEDDY_RUNTIME_COMMON_H_ +#define TEDDY_RUNTIME_COMMON_H_ + +#include "fdr_confirm.h" +#include "fdr_confirm_runtime.h" +#include "ue2common.h" +#include "util/bitutils.h" +#include "util/simd_utils.h" +#include "util/uniform_ops.h" + +extern const u8 ALIGN_DIRECTIVE p_mask_arr[17][32]; +#if defined(HAVE_AVX2) +extern const u8 ALIGN_AVX_DIRECTIVE p_mask_arr256[33][64]; +#endif + +#if defined(HAVE_AVX512VBMI) +static const u8 ALIGN_DIRECTIVE p_sh_mask_arr[80] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f +}; +#endif + +#ifdef ARCH_64_BIT +#define TEDDY_CONF_TYPE u64a +#define TEDDY_FIND_AND_CLEAR_LSB(conf) findAndClearLSB_64(conf) +#else +#define TEDDY_CONF_TYPE u32 +#define TEDDY_FIND_AND_CLEAR_LSB(conf) findAndClearLSB_32(conf) +#endif + +#define CHECK_HWLM_TERMINATE_MATCHING \ +do { \ + if (unlikely(control == HWLM_TERMINATE_MATCHING)) { \ + return HWLM_TERMINATED; \ + } \ +} while (0); + +#define CHECK_FLOOD \ +do { \ + if (unlikely(ptr > tryFloodDetect)) { \ + tryFloodDetect = floodDetect(fdr, a, &ptr, tryFloodDetect, \ + &floodBackoff, &control, iterBytes); \ + CHECK_HWLM_TERMINATE_MATCHING; \ + } \ +} while (0); + +/* + * \brief Copy a block of [0,15] bytes efficiently. + * + * This function is a workaround intended to stop some compilers from + * synthesizing a memcpy function call out of the copy of a small number of + * bytes that we do in vectoredLoad128. 
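The overlapping-chunk trick that copyRuntBlock128() relies on can be shown in isolation: a 5..7 byte copy done as two 4-byte moves whose ranges overlap in the middle, so no byte loop is needed and the fixed-size memcpy() calls compile down to single unaligned 32-bit loads/stores (the same effect as the unaligned_load_u32/unaligned_store_u32 helpers). This is a minimal sketch, not the library routine itself.

#include <assert.h>
#include <stdint.h>
#include <string.h>

static void copy5to7(uint8_t *dst, const uint8_t *src, size_t len) {
    uint32_t head, tail;
    assert(len >= 5 && len <= 7);
    memcpy(&head, src, 4);            // bytes [0, 4)
    memcpy(&tail, src + len - 4, 4);  // bytes [len-4, len), overlaps the middle
    memcpy(dst + len - 4, &tail, 4);
    memcpy(dst, &head, 4);
}

int main(void) {
    const uint8_t src[7] = { 1, 2, 3, 4, 5, 6, 7 };
    uint8_t dst[7] = { 0 };
    copy5to7(dst, src, 6);
    assert(memcmp(dst, src, 6) == 0);
    return 0;
}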
+ */ +static really_inline +void copyRuntBlock128(u8 *dst, const u8 *src, size_t len) { + switch (len) { + case 0: + break; + case 1: + *dst = *src; + break; + case 2: + unaligned_store_u16(dst, unaligned_load_u16(src)); + break; + case 3: + unaligned_store_u16(dst, unaligned_load_u16(src)); + dst[2] = src[2]; + break; + case 4: + unaligned_store_u32(dst, unaligned_load_u32(src)); + break; + case 5: + case 6: + case 7: + /* Perform copy with two overlapping 4-byte chunks. */ + unaligned_store_u32(dst + len - 4, unaligned_load_u32(src + len - 4)); + unaligned_store_u32(dst, unaligned_load_u32(src)); + break; + case 8: + unaligned_store_u64a(dst, unaligned_load_u64a(src)); + break; + default: + /* Perform copy with two overlapping 8-byte chunks. */ + assert(len < 16); + unaligned_store_u64a(dst + len - 8, unaligned_load_u64a(src + len - 8)); + unaligned_store_u64a(dst, unaligned_load_u64a(src)); + break; + } +} + +// Note: p_mask is an output param that initialises a poison mask. +// *p_mask = load128(p_mask_arr[n] + 16 - m) means: +// m byte 0xff in the beginning, followed by n byte 0x00, +// then followed by the rest bytes 0xff. +// ptr >= lo: +// no history. +// for end/short zone, ptr==lo and start_offset==0 +// for start zone, see below +// lo ptr hi hi +// |----------|-------|----------------|............| +// -start 0 -start+offset MIN(avail,16) +// p_mask ffff..ff0000...........00ffff.......... +// ptr < lo: +// only start zone. +// history +// ptr lo hi hi +// |----------|-------|----------------|............| +// 0 start start+offset end(<=16) +// p_mask ffff.....ffffff..ff0000...........00ffff.......... +static really_inline +m128 vectoredLoad128(m128 *p_mask, const u8 *ptr, const size_t start_offset, + const u8 *lo, const u8 *hi, + const u8 *buf_history, size_t len_history, + const u32 nMasks) { + union { + u8 val8[16]; + m128 val128; + } u; + u.val128 = zeroes128(); + + uintptr_t copy_start; + uintptr_t copy_len; + + if (ptr >= lo) { // short/end/start zone + uintptr_t start = (uintptr_t)(ptr - lo); + uintptr_t avail = (uintptr_t)(hi - ptr); + if (avail >= 16) { + assert(start_offset - start <= 16); + *p_mask = loadu128(p_mask_arr[16 - start_offset + start] + + 16 - start_offset + start); + return loadu128(ptr); + } + assert(start_offset - start <= avail); + *p_mask = loadu128(p_mask_arr[avail - start_offset + start] + + 16 - start_offset + start); + copy_start = 0; + copy_len = avail; + } else { // start zone + uintptr_t need = MIN((uintptr_t)(lo - ptr), + MIN(len_history, nMasks - 1)); + uintptr_t start = (uintptr_t)(lo - ptr); + uintptr_t i; + for (i = start - need; i < start; i++) { + u.val8[i] = buf_history[len_history - (start - i)]; + } + uintptr_t end = MIN(16, (uintptr_t)(hi - ptr)); + assert(start + start_offset <= end); + *p_mask = loadu128(p_mask_arr[end - start - start_offset] + + 16 - start - start_offset); + copy_start = start; + copy_len = end - start; + } + + // Runt block from the buffer. + copyRuntBlock128(&u.val8[copy_start], &ptr[copy_start], copy_len); + + return u.val128; +} + +#if defined(HAVE_AVX2) +/* + * \brief Copy a block of [0,31] bytes efficiently. + * + * This function is a workaround intended to stop some compilers from + * synthesizing a memcpy function call out of the copy of a small number of + * bytes that we do in vectoredLoad256. 
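The poison-mask convention used by vectoredLoad128() above — some leading 0xff bytes, a run of 0x00 for the valid window, 0xff for the tail — can be reproduced with a tiny scalar helper. Once such a mask is OR'ed into the Teddy result, lanes outside the valid window can never report a candidate; p_mask_arr is just a precomputed table of these masks. The window bounds below are illustrative.

#include <stdint.h>
#include <stdio.h>

// Positions outside [start, end) are forced to 0xff ("poisoned").
static void build_poison_mask(uint8_t mask[16], unsigned start, unsigned end) {
    for (unsigned i = 0; i < 16; i++)
        mask[i] = (i >= start && i < end) ? 0x00 : 0xff;
}

int main(void) {
    uint8_t mask[16];
    build_poison_mask(mask, 3, 11);   // valid bytes: offsets 3..10
    for (unsigned i = 0; i < 16; i++)
        printf("%02x%c", mask[i], i == 15 ? '\n' : ' ');
    return 0;
}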
+ */ +static really_inline +void copyRuntBlock256(u8 *dst, const u8 *src, size_t len) { + switch (len) { + case 0: + break; + case 1: + *dst = *src; + break; + case 2: + unaligned_store_u16(dst, unaligned_load_u16(src)); + break; + case 3: + unaligned_store_u16(dst, unaligned_load_u16(src)); + dst[2] = src[2]; + break; + case 4: + unaligned_store_u32(dst, unaligned_load_u32(src)); + break; + case 5: + case 6: + case 7: + /* Perform copy with two overlapping 4-byte chunks. */ + unaligned_store_u32(dst + len - 4, unaligned_load_u32(src + len - 4)); + unaligned_store_u32(dst, unaligned_load_u32(src)); + break; + case 8: + unaligned_store_u64a(dst, unaligned_load_u64a(src)); + break; + case 9: + case 10: + case 11: + case 12: + case 13: + case 14: + case 15: + /* Perform copy with two overlapping 8-byte chunks. */ + unaligned_store_u64a(dst + len - 8, unaligned_load_u64a(src + len - 8)); + unaligned_store_u64a(dst, unaligned_load_u64a(src)); + break; + case 16: + storeu128(dst, loadu128(src)); + break; + default: + /* Perform copy with two overlapping 16-byte chunks. */ + assert(len < 32); + storeu128(dst + len - 16, loadu128(src + len - 16)); + storeu128(dst, loadu128(src)); + break; + } +} + +// Note: p_mask is an output param that initialises a poison mask. +// *p_mask = load256(p_mask_arr256[n] + 32 - m) means: +// m byte 0xff in the beginning, followed by n byte 0x00, +// then followed by the rest bytes 0xff. +// ptr >= lo: +// no history. +// for end/short zone, ptr==lo and start_offset==0 +// for start zone, see below +// lo ptr hi hi +// |----------|-------|----------------|............| +// -start 0 -start+offset MIN(avail,32) +// p_mask ffff..ff0000...........00ffff.......... +// ptr < lo: +// only start zone. +// history +// ptr lo hi hi +// |----------|-------|----------------|............| +// 0 start start+offset end(<=32) +// p_mask ffff.....ffffff..ff0000...........00ffff.......... +static really_inline +m256 vectoredLoad256(m256 *p_mask, const u8 *ptr, const size_t start_offset, + const u8 *lo, const u8 *hi, + const u8 *buf_history, size_t len_history, + const u32 nMasks) { + union { + u8 val8[32]; + m256 val256; + } u; + u.val256 = zeroes256(); + + uintptr_t copy_start; + uintptr_t copy_len; + + if (ptr >= lo) { // short/end/start zone + uintptr_t start = (uintptr_t)(ptr - lo); + uintptr_t avail = (uintptr_t)(hi - ptr); + if (avail >= 32) { + assert(start_offset - start <= 32); + *p_mask = loadu256(p_mask_arr256[32 - start_offset + start] + + 32 - start_offset + start); + return loadu256(ptr); + } + assert(start_offset - start <= avail); + *p_mask = loadu256(p_mask_arr256[avail - start_offset + start] + + 32 - start_offset + start); + copy_start = 0; + copy_len = avail; + } else { //start zone + uintptr_t need = MIN((uintptr_t)(lo - ptr), + MIN(len_history, nMasks - 1)); + uintptr_t start = (uintptr_t)(lo - ptr); + uintptr_t i; + for (i = start - need; i < start; i++) { + u.val8[i] = buf_history[len_history - (start - i)]; + } + uintptr_t end = MIN(32, (uintptr_t)(hi - ptr)); + assert(start + start_offset <= end); + *p_mask = loadu256(p_mask_arr256[end - start - start_offset] + + 32 - start - start_offset); + copy_start = start; + copy_len = end - start; + } + + // Runt block from the buffer. + copyRuntBlock256(&u.val8[copy_start], &ptr[copy_start], copy_len); + + return u.val256; +} +#endif // HAVE_AVX2 + +#if defined(HAVE_AVX512) +// Note: p_mask is an output param that initialises a poison mask. 
+// u64a k = ones_u64a << n' >> m'; // m' < n' +// *p_mask = set_mask_m512(~k); +// means p_mask is consist of: +// (n' - m') poison bytes "0xff" at the beginning, +// followed by (64 - n') valid bytes "0x00", +// then followed by the rest m' poison bytes "0xff". +// ptr >= lo: +// no history. +// for end/short zone, ptr==lo and start_offset==0 +// for start zone, see below +// lo ptr hi hi +// |----------|-------|----------------|............| +// -start 0 -start+offset MIN(avail,64) +// p_mask ffff..ff0000...........00ffff.......... +// ptr < lo: +// only start zone. +// history +// ptr lo hi hi +// |----------|-------|----------------|............| +// 0 start start+offset end(<=64) +// p_mask ffff.....ffffff..ff0000...........00ffff.......... +static really_inline +m512 vectoredLoad512(m512 *p_mask, const u8 *ptr, const size_t start_offset, + const u8 *lo, const u8 *hi, const u8 *hbuf, size_t hlen, + const u32 nMasks) { + m512 val; + + uintptr_t copy_start; + uintptr_t copy_len; + + if (ptr >= lo) { // short/end/start zone + uintptr_t start = (uintptr_t)(ptr - lo); + uintptr_t avail = (uintptr_t)(hi - ptr); + if (avail >= 64) { + assert(start_offset - start <= 64); + u64a k = ones_u64a << (start_offset - start); + *p_mask = set_mask_m512(~k); + return loadu512(ptr); + } + assert(start_offset - start <= avail); + u64a k = ones_u64a << (64 - avail + start_offset - start) + >> (64 - avail); + *p_mask = set_mask_m512(~k); + copy_start = 0; + copy_len = avail; + } else { //start zone + uintptr_t need = MIN((uintptr_t)(lo - ptr), + MIN(hlen, nMasks - 1)); + uintptr_t start = (uintptr_t)(lo - ptr); + u64a j = 0x7fffffffffffffffULL >> (63 - need) << (start - need); + val = loadu_maskz_m512(j, &hbuf[hlen - start]); + uintptr_t end = MIN(64, (uintptr_t)(hi - ptr)); + assert(start + start_offset <= end); + u64a k = ones_u64a << (64 - end + start + start_offset) >> (64 - end); + *p_mask = set_mask_m512(~k); + copy_start = start; + copy_len = end - start; + } + + assert(copy_len < 64); + assert(copy_len > 0); + u64a j = ones_u64a >> (64 - copy_len) << copy_start; + val = loadu_mask_m512(val, j, ptr); + + return val; +} +#endif // HAVE_AVX512 + +static really_inline +u64a getConfVal(const struct FDR_Runtime_Args *a, const u8 *ptr, u32 byte, + UNUSED CautionReason reason) { + u64a confVal = 0; + const u8 *buf = a->buf; + size_t len = a->len; + const u8 *confirm_loc = ptr + byte - 7; +#if defined(HAVE_AVX512VBMI) + if (likely(confirm_loc >= buf)) { +#else + if (likely(reason == NOT_CAUTIOUS || confirm_loc >= buf)) { +#endif + confVal = lv_u64a(confirm_loc, buf, buf + len); + } else { // r == VECTORING, confirm_loc < buf + u64a histBytes = a->histBytes; + confVal = lv_u64a_ce(confirm_loc, buf, buf + len); + // stitch together confVal and history + u32 overhang = buf - confirm_loc; + histBytes >>= 64 - (overhang * 8); + confVal |= histBytes; + } + return confVal; +} + +static really_inline +void do_confWithBit_teddy(TEDDY_CONF_TYPE *conf, u8 bucket, u8 offset, + const u32 *confBase, CautionReason reason, + const struct FDR_Runtime_Args *a, const u8 *ptr, + hwlmcb_rv_t *control, u32 *last_match) { + do { + u32 bit = TEDDY_FIND_AND_CLEAR_LSB(conf); + u32 byte = bit / bucket + offset; + u32 idx = bit % bucket; + u32 cf = confBase[idx]; + if (!cf) { + continue; + } + const struct FDRConfirm *fdrc = (const struct FDRConfirm *) + ((const u8 *)confBase + cf); + if (!(fdrc->groups & *control)) { + continue; + } + u64a tmp = 0; + u64a confVal = getConfVal(a, ptr, byte, reason); + confWithBit(fdrc, a, ptr - 
a->buf + byte, control, + last_match, confVal, &tmp, 0); + } while (unlikely(*conf)); +} + +static really_inline +const m128 *getMaskBase(const struct Teddy *teddy) { + return (const m128 *)((const u8 *)teddy + ROUNDUP_CL(sizeof(struct Teddy))); +} + +static really_inline +const u64a *getReinforcedMaskBase(const struct Teddy *teddy, u8 numMask) { + return (const u64a *)((const u8 *)getMaskBase(teddy) + + ROUNDUP_CL(2 * numMask * sizeof(m128))); +} + +static really_inline +const u32 *getConfBase(const struct Teddy *teddy) { + return (const u32 *)((const u8 *)teddy + teddy->confOffset); +} + +#endif /* TEDDY_RUNTIME_COMMON_H_ */ diff --git a/regex/hs.h b/regex/hs.h new file mode 100644 index 000000000..2fe5d248b --- /dev/null +++ b/regex/hs.h @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2015-2020, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef HS_H_ +#define HS_H_ + +/** + * @file + * @brief The complete Hyperscan API definition. + * + * Hyperscan is a high speed regular expression engine. + * + * This header includes both the Hyperscan compiler and runtime components. See + * the individual component headers for documentation. + */ + +/* The current Hyperscan version information. */ + +#define HS_MAJOR 5 +#define HS_MINOR 4 +#define HS_PATCH 0 + +#include "hs_compile.h" +#include "hs_runtime.h" + +#endif /* HS_H_ */ diff --git a/regex/hs_common.h b/regex/hs_common.h new file mode 100644 index 000000000..8366d0018 --- /dev/null +++ b/regex/hs_common.h @@ -0,0 +1,600 @@ +/* + * Copyright (c) 2015-2019, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef HS_COMMON_H_ +#define HS_COMMON_H_ + +#if defined(_WIN32) +#define HS_CDECL __cdecl +#else +#define HS_CDECL +#endif +#ifndef __KERNEL__ +#include +#else +#include +#endif + +/** + * @file + * @brief The Hyperscan common API definition. + * + * Hyperscan is a high speed regular expression engine. + * + * This header contains functions available to both the Hyperscan compiler and + * runtime. + */ + +#ifdef __cplusplus +extern "C" +{ +#endif + +struct hs_database; + +/** + * A Hyperscan pattern database. + * + * Generated by one of the Hyperscan compiler functions: + * - @ref hs_compile() + * - @ref hs_compile_multi() + * - @ref hs_compile_ext_multi() + */ +typedef struct hs_database hs_database_t; + +/** + * A type for errors returned by Hyperscan functions. + */ +typedef int hs_error_t; + +/** + * Free a compiled pattern database. + * + * The free callback set by @ref hs_set_database_allocator() (or @ref + * hs_set_allocator()) will be used by this function. + * + * @param db + * A compiled pattern database. NULL may also be safely provided, in which + * case the function does nothing. + * + * @return + * @ref HS_SUCCESS on success, other values on failure. + */ +hs_error_t HS_CDECL hs_free_database(hs_database_t *db); + +/** + * Serialize a pattern database to a stream of bytes. + * + * The allocator callback set by @ref hs_set_misc_allocator() (or @ref + * hs_set_allocator()) will be used by this function. + * + * @param db + * A compiled pattern database. + * + * @param bytes + * On success, a pointer to an array of bytes will be returned here. + * These bytes can be subsequently relocated or written to disk. The + * caller is responsible for freeing this block. + * + * @param length + * On success, the number of bytes in the generated byte array will be + * returned here. + * + * @return + * @ref HS_SUCCESS on success, @ref HS_NOMEM if the byte array cannot be + * allocated, other values may be returned if errors are detected. + */ +hs_error_t HS_CDECL hs_serialize_database(const hs_database_t *db, char **bytes, + size_t *length); + +/** + * Reconstruct a pattern database from a stream of bytes previously generated + * by @ref hs_serialize_database(). + * + * This function will allocate sufficient space for the database using the + * allocator set with @ref hs_set_database_allocator() (or @ref + * hs_set_allocator()); to use a pre-allocated region of memory, use the @ref + * hs_deserialize_database_at() function. 
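A minimal sketch of the serialize/deserialize round trip documented here; it assumes `db` was produced by one of the compile calls and that the default (malloc-based) misc allocator is in use, so plain free() releases the serialized image.

#include <stdio.h>
#include <stdlib.h>
#include "hs.h"

static hs_database_t *round_trip(const hs_database_t *db) {
    char *bytes = NULL;
    size_t length = 0;
    if (hs_serialize_database(db, &bytes, &length) != HS_SUCCESS) {
        fprintf(stderr, "serialization failed\n");
        return NULL;
    }
    // bytes/length could be written to disk or shipped to another host here.
    hs_database_t *copy = NULL;
    if (hs_deserialize_database(bytes, length, &copy) != HS_SUCCESS) {
        fprintf(stderr, "deserialization failed\n");
        copy = NULL;
    }
    free(bytes);   // serialized image came from the misc allocator (malloc by default)
    return copy;   // caller releases with hs_free_database()
}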
+ * + * @param bytes + * A byte array generated by @ref hs_serialize_database() representing a + * compiled pattern database. + * + * @param length + * The length of the byte array generated by @ref hs_serialize_database(). + * This should be the same value as that returned by @ref + * hs_serialize_database(). + * + * @param db + * On success, a pointer to a newly allocated @ref hs_database_t will be + * returned here. This database can then be used for scanning, and + * eventually freed by the caller using @ref hs_free_database(). + * + * @return + * @ref HS_SUCCESS on success, other values on failure. + */ +hs_error_t HS_CDECL hs_deserialize_database(const char *bytes, + const size_t length, + hs_database_t **db); + +/** + * Reconstruct a pattern database from a stream of bytes previously generated + * by @ref hs_serialize_database() at a given memory location. + * + * This function (unlike @ref hs_deserialize_database()) will write the + * reconstructed database to the memory location given in the @p db parameter. + * The amount of space required at this location can be determined with the + * @ref hs_serialized_database_size() function. + * + * @param bytes + * A byte array generated by @ref hs_serialize_database() representing a + * compiled pattern database. + * + * @param length + * The length of the byte array generated by @ref hs_serialize_database(). + * This should be the same value as that returned by @ref + * hs_serialize_database(). + * + * @param db + * Pointer to an 8-byte aligned block of memory of sufficient size to hold + * the deserialized database. On success, the reconstructed database will + * be written to this location. This database can then be used for pattern + * matching. The user is responsible for freeing this memory; the @ref + * hs_free_database() call should not be used. + * + * @return + * @ref HS_SUCCESS on success, other values on failure. + */ +hs_error_t HS_CDECL hs_deserialize_database_at(const char *bytes, + const size_t length, + hs_database_t *db); + +/** + * Provides the size of the stream state allocated by a single stream opened + * against the given database. + * + * @param database + * Pointer to a compiled (streaming mode) pattern database. + * + * @param stream_size + * On success, the size in bytes of an individual stream opened against the + * given database is placed in this parameter. + * + * @return + * @ref HS_SUCCESS on success, other values on failure. + */ +hs_error_t HS_CDECL hs_stream_size(const hs_database_t *database, + size_t *stream_size); + +/** + * Provides the size of the given database in bytes. + * + * @param database + * Pointer to compiled pattern database. + * + * @param database_size + * On success, the size of the compiled database in bytes is placed in this + * parameter. + * + * @return + * @ref HS_SUCCESS on success, other values on failure. + */ +hs_error_t HS_CDECL hs_database_size(const hs_database_t *database, + size_t *database_size); + +/** + * Utility function for reporting the size that would be required by a + * database if it were deserialized. + * + * This can be used to allocate a shared memory region or other "special" + * allocation prior to deserializing with the @ref hs_deserialize_database_at() + * function. + * + * @param bytes + * Pointer to a byte array generated by @ref hs_serialize_database() + * representing a compiled pattern database. + * + * @param length + * The length of the byte array generated by @ref hs_serialize_database(). 
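For the pre-allocated variant, the two calls combine as sketched below: query the required size, obtain caller-managed memory, then deserialize in place. `bytes`/`length` are assumed to come from hs_serialize_database(); malloc() is used here on the assumption that its alignment (at least 8 bytes on mainstream ABIs) satisfies the documented requirement.

#include <stdlib.h>
#include "hs.h"

static hs_database_t *deserialize_in_place(const char *bytes, size_t length) {
    size_t db_size = 0;
    if (hs_serialized_database_size(bytes, length, &db_size) != HS_SUCCESS)
        return NULL;
    hs_database_t *db = malloc(db_size);   // 8-byte aligned on common platforms
    if (!db)
        return NULL;
    if (hs_deserialize_database_at(bytes, length, db) != HS_SUCCESS) {
        free(db);   // note: hs_free_database() must NOT be used for this memory
        return NULL;
    }
    return db;     // caller frees with free(), not hs_free_database()
}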
+ * This should be the same value as that returned by @ref + * hs_serialize_database(). + * + * @param deserialized_size + * On success, the size of the compiled database that would be generated + * by @ref hs_deserialize_database_at() is returned here. + * + * @return + * @ref HS_SUCCESS on success, other values on failure. + */ +hs_error_t HS_CDECL hs_serialized_database_size(const char *bytes, + const size_t length, + size_t *deserialized_size); + +/** + * Utility function providing information about a database. + * + * @param database + * Pointer to a compiled database. + * + * @param info + * On success, a string containing the version and platform information for + * the supplied database is placed in the parameter. The string is + * allocated using the allocator supplied in @ref hs_set_misc_allocator() + * (or malloc() if no allocator was set) and should be freed by the caller. + * + * @return + * @ref HS_SUCCESS on success, other values on failure. + */ +hs_error_t HS_CDECL hs_database_info(const hs_database_t *database, + char **info); + +/** + * Utility function providing information about a serialized database. + * + * @param bytes + * Pointer to a serialized database. + * + * @param length + * Length in bytes of the serialized database. + * + * @param info + * On success, a string containing the version and platform information + * for the supplied serialized database is placed in the parameter. The + * string is allocated using the allocator supplied in @ref + * hs_set_misc_allocator() (or malloc() if no allocator was set) and + * should be freed by the caller. + * + * @return + * @ref HS_SUCCESS on success, other values on failure. + */ +hs_error_t HS_CDECL hs_serialized_database_info(const char *bytes, + size_t length, char **info); + +/** + * The type of the callback function that will be used by Hyperscan to allocate + * more memory at runtime as required, for example in @ref hs_open_stream() to + * allocate stream state. + * + * If Hyperscan is to be used in a multi-threaded, or similarly concurrent + * environment, the allocation function will need to be re-entrant, or + * similarly safe for concurrent use. + * + * @param size + * The number of bytes to allocate. + * @return + * A pointer to the region of memory allocated, or NULL on error. + */ +typedef void *(HS_CDECL *hs_alloc_t)(size_t size); + +/** + * The type of the callback function that will be used by Hyperscan to free + * memory regions previously allocated using the @ref hs_alloc_t function. + * + * @param ptr + * The region of memory to be freed. + */ +typedef void (HS_CDECL *hs_free_t)(void *ptr); + +/** + * Set the allocate and free functions used by Hyperscan for allocating + * memory at runtime for stream state, scratch space, database bytecode, + * and various other data structure returned by the Hyperscan API. + * + * The function is equivalent to calling @ref hs_set_stream_allocator(), + * @ref hs_set_scratch_allocator(), @ref hs_set_database_allocator() and + * @ref hs_set_misc_allocator() with the provided parameters. + * + * This call will override any previous allocators that have been set. + * + * Note: there is no way to change the allocator used for temporary objects + * created during the various compile calls (@ref hs_compile(), @ref + * hs_compile_multi(), @ref hs_compile_ext_multi()). + * + * @param alloc_func + * A callback function pointer that allocates memory. This function must + * return memory suitably aligned for the largest representable data type + * on this platform. 
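A small usage sketch for hs_database_info() as documented above; `db` is assumed to be a valid compiled database, and the returned string is released with free() on the assumption that the default misc allocator is in effect.

#include <stdio.h>
#include <stdlib.h>
#include "hs.h"

static void print_db_info(const hs_database_t *db) {
    char *info = NULL;
    if (hs_database_info(db, &info) == HS_SUCCESS) {
        printf("database: %s\n", info);
        free(info);   // allocated via the misc allocator (malloc by default)
    }
}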
+ * + * @param free_func + * A callback function pointer that frees allocated memory. + * + * @return + * @ref HS_SUCCESS on success, other values on failure. + */ +hs_error_t HS_CDECL hs_set_allocator(hs_alloc_t alloc_func, + hs_free_t free_func); + +/** + * Set the allocate and free functions used by Hyperscan for allocating memory + * for database bytecode produced by the compile calls (@ref hs_compile(), @ref + * hs_compile_multi(), @ref hs_compile_ext_multi()) and by database + * deserialization (@ref hs_deserialize_database()). + * + * If no database allocation functions are set, or if NULL is used in place of + * both parameters, then memory allocation will default to standard methods + * (such as the system malloc() and free() calls). + * + * This call will override any previous database allocators that have been set. + * + * Note: the database allocator may also be set by calling @ref + * hs_set_allocator(). + * + * Note: there is no way to change how temporary objects created during the + * various compile calls (@ref hs_compile(), @ref hs_compile_multi(), @ref + * hs_compile_ext_multi()) are allocated. + * + * @param alloc_func + * A callback function pointer that allocates memory. This function must + * return memory suitably aligned for the largest representable data type + * on this platform. + * + * @param free_func + * A callback function pointer that frees allocated memory. + * + * @return + * @ref HS_SUCCESS on success, other values on failure. + */ +hs_error_t HS_CDECL hs_set_database_allocator(hs_alloc_t alloc_func, + hs_free_t free_func); + +/** + * Set the allocate and free functions used by Hyperscan for allocating memory + * for items returned by the Hyperscan API such as @ref hs_compile_error_t, @ref + * hs_expr_info_t and serialized databases. + * + * If no misc allocation functions are set, or if NULL is used in place of both + * parameters, then memory allocation will default to standard methods (such as + * the system malloc() and free() calls). + * + * This call will override any previous misc allocators that have been set. + * + * Note: the misc allocator may also be set by calling @ref hs_set_allocator(). + * + * @param alloc_func + * A callback function pointer that allocates memory. This function must + * return memory suitably aligned for the largest representable data type + * on this platform. + * + * @param free_func + * A callback function pointer that frees allocated memory. + * + * @return + * @ref HS_SUCCESS on success, other values on failure. + */ +hs_error_t HS_CDECL hs_set_misc_allocator(hs_alloc_t alloc_func, + hs_free_t free_func); + +/** + * Set the allocate and free functions used by Hyperscan for allocating memory + * for scratch space by @ref hs_alloc_scratch() and @ref hs_clone_scratch(). + * + * If no scratch allocation functions are set, or if NULL is used in place of + * both parameters, then memory allocation will default to standard methods + * (such as the system malloc() and free() calls). + * + * This call will override any previous scratch allocators that have been set. + * + * Note: the scratch allocator may also be set by calling @ref + * hs_set_allocator(). + * + * @param alloc_func + * A callback function pointer that allocates memory. This function must + * return memory suitably aligned for the largest representable data type + * on this platform. + * + * @param free_func + * A callback function pointer that frees allocated memory. + * + * @return + * @ref HS_SUCCESS on success, other values on failure. 
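Installing custom allocation hooks, as described above, looks roughly like this. The counting wrappers are illustrative only (and not thread-safe); the key constraints from the documentation are that the allocator must return suitably aligned memory — which malloc() does — and that hs_set_allocator() covers stream state, scratch, database bytecode and misc objects at once.

#include <stdlib.h>
#include "hs.h"

static size_t bytes_requested;   // illustrative counter, not thread-safe

static void *counting_alloc(size_t size) {
    bytes_requested += size;
    return malloc(size);   // malloc alignment satisfies the API requirement
}

static void counting_free(void *ptr) {
    free(ptr);
}

static hs_error_t install_counting_allocator(void) {
    return hs_set_allocator(counting_alloc, counting_free);
}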
+ */ +hs_error_t HS_CDECL hs_set_scratch_allocator(hs_alloc_t alloc_func, + hs_free_t free_func); + +/** + * Set the allocate and free functions used by Hyperscan for allocating memory + * for stream state by @ref hs_open_stream(). + * + * If no stream allocation functions are set, or if NULL is used in place of + * both parameters, then memory allocation will default to standard methods + * (such as the system malloc() and free() calls). + * + * This call will override any previous stream allocators that have been set. + * + * Note: the stream allocator may also be set by calling @ref + * hs_set_allocator(). + * + * @param alloc_func + * A callback function pointer that allocates memory. This function must + * return memory suitably aligned for the largest representable data type + * on this platform. + * + * @param free_func + * A callback function pointer that frees allocated memory. + * + * @return + * @ref HS_SUCCESS on success, other values on failure. + */ +hs_error_t HS_CDECL hs_set_stream_allocator(hs_alloc_t alloc_func, + hs_free_t free_func); + +/** + * Utility function for identifying this release version. + * + * @return + * A string containing the version number of this release build and the + * date of the build. It is allocated statically, so it does not need to + * be freed by the caller. + */ +const char * HS_CDECL hs_version(void); + +/** + * Utility function to test the current system architecture. + * + * Hyperscan requires the Supplemental Streaming SIMD Extensions 3 instruction + * set. This function can be called on any x86 platform to determine if the + * system provides the required instruction set. + * + * This function does not test for more advanced features if Hyperscan has + * been built for a more specific architecture, for example the AVX2 + * instruction set. + * + * @return + * @ref HS_SUCCESS on success, @ref HS_ARCH_ERROR if system does not + * support Hyperscan. + */ +hs_error_t HS_CDECL hs_valid_platform(void); + +/** + * @defgroup HS_ERROR hs_error_t values + * + * @{ + */ + +/** + * The engine completed normally. + */ +#define HS_SUCCESS 0 + +/** + * A parameter passed to this function was invalid. + * + * This error is only returned in cases where the function can detect an + * invalid parameter -- it cannot be relied upon to detect (for example) + * pointers to freed memory or other invalid data. + */ +#define HS_INVALID (-1) + +/** + * A memory allocation failed. + */ +#define HS_NOMEM (-2) + +/** + * The engine was terminated by callback. + * + * This return value indicates that the target buffer was partially scanned, + * but that the callback function requested that scanning cease after a match + * was located. + */ +#define HS_SCAN_TERMINATED (-3) + +/** + * The pattern compiler failed, and the @ref hs_compile_error_t should be + * inspected for more detail. + */ +#define HS_COMPILER_ERROR (-4) + +/** + * The given database was built for a different version of Hyperscan. + */ +#define HS_DB_VERSION_ERROR (-5) + +/** + * The given database was built for a different platform (i.e., CPU type). + */ +#define HS_DB_PLATFORM_ERROR (-6) + +/** + * The given database was built for a different mode of operation. This error + * is returned when streaming calls are used with a block or vectored database + * and vice versa. + */ +#define HS_DB_MODE_ERROR (-7) + +/** + * A parameter passed to this function was not correctly aligned. 
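The error codes defined in this group are plain negative integers, so callers typically map them to log messages themselves. The helper below is an illustrative sketch, not part of the API, covering the codes documented so far.

#include "hs.h"

static const char *hs_strerror_sketch(hs_error_t err) {
    switch (err) {
    case HS_SUCCESS:           return "success";
    case HS_INVALID:           return "invalid parameter";
    case HS_NOMEM:             return "allocation failed";
    case HS_SCAN_TERMINATED:   return "scan terminated by callback";
    case HS_COMPILER_ERROR:    return "pattern compilation failed";
    case HS_DB_VERSION_ERROR:  return "database built for another version";
    case HS_DB_PLATFORM_ERROR: return "database built for another platform";
    case HS_DB_MODE_ERROR:     return "database built for another mode";
    default:                   return "unknown error";
    }
}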
+ */ +#define HS_BAD_ALIGN (-8) + +/** + * The memory allocator (either malloc() or the allocator set with @ref + * hs_set_allocator()) did not correctly return memory suitably aligned for the + * largest representable data type on this platform. + */ +#define HS_BAD_ALLOC (-9) + +/** + * The scratch region was already in use. + * + * This error is returned when Hyperscan is able to detect that the scratch + * region given is already in use by another Hyperscan API call. + * + * A separate scratch region, allocated with @ref hs_alloc_scratch() or @ref + * hs_clone_scratch(), is required for every concurrent caller of the Hyperscan + * API. + * + * For example, this error might be returned when @ref hs_scan() has been + * called inside a callback delivered by a currently-executing @ref hs_scan() + * call using the same scratch region. + * + * Note: Not all concurrent uses of scratch regions may be detected. This error + * is intended as a best-effort debugging tool, not a guarantee. + */ +#define HS_SCRATCH_IN_USE (-10) + +/** + * Unsupported CPU architecture. + * + * This error is returned when Hyperscan is able to detect that the current + * system does not support the required instruction set. + * + * At a minimum, Hyperscan requires Supplemental Streaming SIMD Extensions 3 + * (SSSE3). + */ +#define HS_ARCH_ERROR (-11) + +/** + * Provided buffer was too small. + * + * This error indicates that there was insufficient space in the buffer. The + * call should be repeated with a larger provided buffer. + * + * Note: in this situation, it is normal for the amount of space required to be + * returned in the same manner as the used space would have been returned if the + * call was successful. + */ +#define HS_INSUFFICIENT_SPACE (-12) + +/** + * Unexpected internal error. + * + * This error indicates that there was unexpected matching behaviors. This + * could be related to invalid usage of stream and scratch space or invalid memory + * operations by users. + * + */ +#define HS_UNKNOWN_ERROR (-13) + +/** @} */ + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif /* HS_COMMON_H_ */ diff --git a/regex/hs_compile.h b/regex/hs_compile.h new file mode 100644 index 000000000..b318c29db --- /dev/null +++ b/regex/hs_compile.h @@ -0,0 +1,1224 @@ +/* + * Copyright (c) 2015-2020, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef HS_COMPILE_H_ +#define HS_COMPILE_H_ + +/** + * @file + * @brief The Hyperscan compiler API definition. + * + * Hyperscan is a high speed regular expression engine. + * + * This header contains functions for compiling regular expressions into + * Hyperscan databases that can be used by the Hyperscan runtime. + */ + +#include "hs_common.h" + +#ifdef __cplusplus +extern "C" +{ +#endif + +/** + * A type containing error details that is returned by the compile calls (@ref + * hs_compile(), @ref hs_compile_multi() and @ref hs_compile_ext_multi()) on + * failure. The caller may inspect the values returned in this type to + * determine the cause of failure. + * + * Common errors generated during the compile process include: + * + * - *Invalid parameter* + * + * An invalid argument was specified in the compile call. + * + * - *Unrecognised flag* + * + * An unrecognised value was passed in the flags argument. + * + * - *Pattern matches empty buffer* + * + * By default, Hyperscan only supports patterns that will *always* + * consume at least one byte of input. Patterns that do not have this + * property (such as `/(abc)?/`) will produce this error unless + * the @ref HS_FLAG_ALLOWEMPTY flag is supplied. Note that such + * patterns will produce a match for *every* byte when scanned. + * + * - *Embedded anchors not supported* + * + * Hyperscan only supports the use of anchor meta-characters (such as + * `^` and `$`) in patterns where they could *only* match + * at the start or end of a buffer. A pattern containing an embedded + * anchor, such as `/abc^def/`, can never match, as there is no + * way for `abc` to precede the start of the data stream. + * + * - *Bounded repeat is too large* + * + * The pattern contains a repeated construct with very large finite + * bounds. + * + * - *Unsupported component type* + * + * An unsupported PCRE construct was used in the pattern. + * + * - *Unable to generate bytecode* + * + * This error indicates that Hyperscan was unable to compile a pattern + * that is syntactically valid. The most common cause is a pattern that is + * very long and complex or contains a large repeated subpattern. + * + * - *Unable to allocate memory* + * + * The library was unable to allocate temporary storage used during + * compilation time. + * + * - *Allocator returned misaligned memory* + * + * The memory allocator (either malloc() or the allocator set with @ref + * hs_set_allocator()) did not correctly return memory suitably aligned + * for the largest representable data type on this platform. + * + * - *Internal error* + * + * An unexpected error occurred: if this error is reported, please contact + * the Hyperscan team with a description of the situation. + */ +typedef struct hs_compile_error { + /** + * A human-readable error message describing the error. + */ + char *message; + + /** + * The zero-based number of the expression that caused the error (if this + * can be determined). 
If the error is not specific to an expression, then + * this value will be less than zero. + */ + int expression; +} hs_compile_error_t; + +/** + * A type containing information on the target platform which may optionally be + * provided to the compile calls (@ref hs_compile(), @ref hs_compile_multi(), + * @ref hs_compile_ext_multi()). + * + * A hs_platform_info structure may be populated for the current platform by + * using the @ref hs_populate_platform() call. + */ +typedef struct hs_platform_info { + /** + * Information about the target platform which may be used to guide the + * optimisation process of the compile. + * + * Use of this field does not limit the processors that the resulting + * database can run on, but may impact the performance of the resulting + * database. + */ + unsigned int tune; + + /** + * Relevant CPU features available on the target platform + * + * This value may be produced by combining HS_CPU_FEATURE_* flags (such as + * @ref HS_CPU_FEATURES_AVX2). Multiple CPU features may be or'ed together + * to produce the value. + */ + unsigned long long cpu_features; + + /** + * Reserved for future use. + */ + unsigned long long reserved1; + + /** + * Reserved for future use. + */ + unsigned long long reserved2; +} hs_platform_info_t; + +/** + * A type containing information related to an expression that is returned by + * @ref hs_expression_info() or @ref hs_expression_ext_info. + */ +typedef struct hs_expr_info { + /** + * The minimum length in bytes of a match for the pattern. + * + * Note: in some cases when using advanced features to suppress matches + * (such as extended parameters or the @ref HS_FLAG_SINGLEMATCH flag) this + * may represent a conservative lower bound for the true minimum length of + * a match. + */ + unsigned int min_width; + + /** + * The maximum length in bytes of a match for the pattern. If the pattern + * has an unbounded maximum length, this will be set to the maximum value + * of an unsigned int (UINT_MAX). + * + * Note: in some cases when using advanced features to suppress matches + * (such as extended parameters or the @ref HS_FLAG_SINGLEMATCH flag) this + * may represent a conservative upper bound for the true maximum length of + * a match. + */ + unsigned int max_width; + + /** + * Whether this expression can produce matches that are not returned in + * order, such as those produced by assertions. Zero if false, non-zero if + * true. + */ + char unordered_matches; + + /** + * Whether this expression can produce matches at end of data (EOD). In + * streaming mode, EOD matches are raised during @ref hs_close_stream(), + * since it is only when @ref hs_close_stream() is called that the EOD + * location is known. Zero if false, non-zero if true. + * + * Note: trailing `\b` word boundary assertions may also result in EOD + * matches as end-of-data can act as a word boundary. + */ + char matches_at_eod; + + /** + * Whether this expression can *only* produce matches at end of data (EOD). + * In streaming mode, all matches for this expression are raised during + * @ref hs_close_stream(). Zero if false, non-zero if true. + */ + char matches_only_at_eod; +} hs_expr_info_t; + +/** + * A structure containing additional parameters related to an expression, + * passed in at build time to @ref hs_compile_ext_multi() or @ref + * hs_expression_ext_info. + * + * These parameters allow the set of matches produced by a pattern to be + * constrained at compile time, rather than relying on the application to + * process unwanted matches at runtime. 
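A short sketch of filling an hs_platform_info_t for the current host with hs_populate_platform(), which is declared later in this header; the describe_host name is illustrative, and passing NULL as the platform argument to the compile calls has the same effect.

#include "hs_compile.h"

/* Describe the build host so it can be passed explicitly to hs_compile(). */
static hs_error_t describe_host(hs_platform_info_t *plat)
{
    hs_error_t err = hs_populate_platform(plat);

    if (err != HS_SUCCESS)
        return err;
    /* plat->tune and plat->cpu_features now reflect the host CPU. */
    return HS_SUCCESS;
}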
+ */ +typedef struct hs_expr_ext { + /** + * Flags governing which parts of this structure are to be used by the + * compiler. See @ref HS_EXT_FLAG. + */ + unsigned long long flags; + + /** + * The minimum end offset in the data stream at which this expression + * should match successfully. To use this parameter, set the + * @ref HS_EXT_FLAG_MIN_OFFSET flag in the hs_expr_ext::flags field. + */ + unsigned long long min_offset; + + /** + * The maximum end offset in the data stream at which this expression + * should match successfully. To use this parameter, set the + * @ref HS_EXT_FLAG_MAX_OFFSET flag in the hs_expr_ext::flags field. + */ + unsigned long long max_offset; + + /** + * The minimum match length (from start to end) required to successfully + * match this expression. To use this parameter, set the + * @ref HS_EXT_FLAG_MIN_LENGTH flag in the hs_expr_ext::flags field. + */ + unsigned long long min_length; + + /** + * Allow patterns to approximately match within this edit distance. To use + * this parameter, set the @ref HS_EXT_FLAG_EDIT_DISTANCE flag in the + * hs_expr_ext::flags field. + */ + unsigned edit_distance; + + /** + * Allow patterns to approximately match within this Hamming distance. To + * use this parameter, set the @ref HS_EXT_FLAG_HAMMING_DISTANCE flag in the + * hs_expr_ext::flags field. + */ + unsigned hamming_distance; +} hs_expr_ext_t; + +/** + * @defgroup HS_EXT_FLAG hs_expr_ext_t flags + * + * These flags are used in @ref hs_expr_ext_t::flags to indicate which fields + * are used. + * + * @{ + */ + +/** Flag indicating that the hs_expr_ext::min_offset field is used. */ +#define HS_EXT_FLAG_MIN_OFFSET 1ULL + +/** Flag indicating that the hs_expr_ext::max_offset field is used. */ +#define HS_EXT_FLAG_MAX_OFFSET 2ULL + +/** Flag indicating that the hs_expr_ext::min_length field is used. */ +#define HS_EXT_FLAG_MIN_LENGTH 4ULL + +/** Flag indicating that the hs_expr_ext::edit_distance field is used. */ +#define HS_EXT_FLAG_EDIT_DISTANCE 8ULL + +/** Flag indicating that the hs_expr_ext::hamming_distance field is used. */ +#define HS_EXT_FLAG_HAMMING_DISTANCE 16ULL + +/** @} */ + +/** + * The basic regular expression compiler. + * + * This is the function call with which an expression is compiled into a + * Hyperscan database which can be passed to the runtime functions (such as + * @ref hs_scan(), @ref hs_open_stream(), etc.) + * + * @param expression + * The NULL-terminated expression to parse. Note that this string must + * represent ONLY the pattern to be matched, with no delimiters or flags; + * any global flags should be specified with the @p flags argument. For + * example, the expression `/abc?def/i` should be compiled by providing + * `abc?def` as the @p expression, and @ref HS_FLAG_CASELESS as the @a + * flags. + * + * @param flags + * Flags which modify the behaviour of the expression. Multiple flags may + * be used by ORing them together. Valid values are: + * - HS_FLAG_CASELESS - Matching will be performed case-insensitively. + * - HS_FLAG_DOTALL - Matching a `.` will not exclude newlines. + * - HS_FLAG_MULTILINE - `^` and `$` anchors match any newlines in data. + * - HS_FLAG_SINGLEMATCH - Only one match will be generated for the + * expression per stream. + * - HS_FLAG_ALLOWEMPTY - Allow expressions which can match against an + * empty string, such as `.*`. + * - HS_FLAG_UTF8 - Treat this pattern as a sequence of UTF-8 characters. + * - HS_FLAG_UCP - Use Unicode properties for character classes. 
+ * - HS_FLAG_PREFILTER - Compile pattern in prefiltering mode. + * - HS_FLAG_SOM_LEFTMOST - Report the leftmost start of match offset + * when a match is found. + * - HS_FLAG_COMBINATION - Parse the expression in logical combination + * syntax. + * - HS_FLAG_QUIET - Ignore match reporting for this expression. Used for + * the sub-expressions in logical combinations. + * + * @param mode + * Compiler mode flags that affect the database as a whole. One of @ref + * HS_MODE_STREAM or @ref HS_MODE_BLOCK or @ref HS_MODE_VECTORED must be + * supplied, to select between the generation of a streaming, block or + * vectored database. In addition, other flags (beginning with HS_MODE_) + * may be supplied to enable specific features. See @ref HS_MODE_FLAG for + * more details. + * + * @param platform + * If not NULL, the platform structure is used to determine the target + * platform for the database. If NULL, a database suitable for running + * on the current host platform is produced. + * + * @param db + * On success, a pointer to the generated database will be returned in + * this parameter, or NULL on failure. The caller is responsible for + * deallocating the buffer using the @ref hs_free_database() function. + * + * @param error + * If the compile fails, a pointer to a @ref hs_compile_error_t will be + * returned, providing details of the error condition. The caller is + * responsible for deallocating the buffer using the @ref + * hs_free_compile_error() function. + * + * @return + * @ref HS_SUCCESS is returned on successful compilation; @ref + * HS_COMPILER_ERROR on failure, with details provided in the error + * parameter. + */ +hs_error_t HS_CDECL hs_compile(const char *expression, unsigned int flags, + unsigned int mode, + const hs_platform_info_t *platform, + hs_database_t **db, hs_compile_error_t **error); + +/** + * The multiple regular expression compiler. + * + * This is the function call with which a set of expressions is compiled into a + * database which can be passed to the runtime functions (such as @ref + * hs_scan(), @ref hs_open_stream(), etc.) Each expression can be labelled with + * a unique integer which is passed into the match callback to identify the + * pattern that has matched. + * + * @param expressions + * Array of NULL-terminated expressions to compile. Note that (as for @ref + * hs_compile()) these strings must contain only the pattern to be + * matched, with no delimiters or flags. For example, the expression + * `/abc?def/i` should be compiled by providing `abc?def` as the first + * string in the @p expressions array, and @ref HS_FLAG_CASELESS as the + * first value in the @p flags array. + * + * @param flags + * Array of flags which modify the behaviour of each expression. Multiple + * flags may be used by ORing them together. Specifying the NULL pointer + * in place of an array will set the flags value for all patterns to zero. + * Valid values are: + * - HS_FLAG_CASELESS - Matching will be performed case-insensitively. + * - HS_FLAG_DOTALL - Matching a `.` will not exclude newlines. + * - HS_FLAG_MULTILINE - `^` and `$` anchors match any newlines in data. + * - HS_FLAG_SINGLEMATCH - Only one match will be generated by patterns + * with this match id per stream. + * - HS_FLAG_ALLOWEMPTY - Allow expressions which can match against an + * empty string, such as `.*`. + * - HS_FLAG_UTF8 - Treat this pattern as a sequence of UTF-8 characters. + * - HS_FLAG_UCP - Use Unicode properties for character classes. 
+ * - HS_FLAG_PREFILTER - Compile pattern in prefiltering mode. + * - HS_FLAG_SOM_LEFTMOST - Report the leftmost start of match offset + * when a match is found. + * - HS_FLAG_COMBINATION - Parse the expression in logical combination + * syntax. + * - HS_FLAG_QUIET - Ignore match reporting for this expression. Used for + * the sub-expressions in logical combinations. + * + * @param ids + * An array of integers specifying the ID number to be associated with the + * corresponding pattern in the expressions array. Specifying the NULL + * pointer in place of an array will set the ID value for all patterns to + * zero. + * + * @param elements + * The number of elements in the input arrays. + * + * @param mode + * Compiler mode flags that affect the database as a whole. One of @ref + * HS_MODE_STREAM or @ref HS_MODE_BLOCK or @ref HS_MODE_VECTORED must be + * supplied, to select between the generation of a streaming, block or + * vectored database. In addition, other flags (beginning with HS_MODE_) + * may be supplied to enable specific features. See @ref HS_MODE_FLAG for + * more details. + * + * @param platform + * If not NULL, the platform structure is used to determine the target + * platform for the database. If NULL, a database suitable for running + * on the current host platform is produced. + * + * @param db + * On success, a pointer to the generated database will be returned in + * this parameter, or NULL on failure. The caller is responsible for + * deallocating the buffer using the @ref hs_free_database() function. + * + * @param error + * If the compile fails, a pointer to a @ref hs_compile_error_t will be + * returned, providing details of the error condition. The caller is + * responsible for deallocating the buffer using the @ref + * hs_free_compile_error() function. + * + * @return + * @ref HS_SUCCESS is returned on successful compilation; @ref + * HS_COMPILER_ERROR on failure, with details provided in the @p error + * parameter. + * + */ +hs_error_t HS_CDECL hs_compile_multi(const char *const *expressions, + const unsigned int *flags, + const unsigned int *ids, + unsigned int elements, unsigned int mode, + const hs_platform_info_t *platform, + hs_database_t **db, + hs_compile_error_t **error); + +/** + * The multiple regular expression compiler with extended parameter support. + * + * This function call compiles a group of expressions into a database in the + * same way as @ref hs_compile_multi(), but allows additional parameters to be + * specified via an @ref hs_expr_ext_t structure per expression. + * + * @param expressions + * Array of NULL-terminated expressions to compile. Note that (as for @ref + * hs_compile()) these strings must contain only the pattern to be + * matched, with no delimiters or flags. For example, the expression + * `/abc?def/i` should be compiled by providing `abc?def` as the first + * string in the @p expressions array, and @ref HS_FLAG_CASELESS as the + * first value in the @p flags array. + * + * @param flags + * Array of flags which modify the behaviour of each expression. Multiple + * flags may be used by ORing them together. Specifying the NULL pointer + * in place of an array will set the flags value for all patterns to zero. + * Valid values are: + * - HS_FLAG_CASELESS - Matching will be performed case-insensitively. + * - HS_FLAG_DOTALL - Matching a `.` will not exclude newlines. + * - HS_FLAG_MULTILINE - `^` and `$` anchors match any newlines in data. 
+ * - HS_FLAG_SINGLEMATCH - Only one match will be generated by patterns + * with this match id per stream. + * - HS_FLAG_ALLOWEMPTY - Allow expressions which can match against an + * empty string, such as `.*`. + * - HS_FLAG_UTF8 - Treat this pattern as a sequence of UTF-8 characters. + * - HS_FLAG_UCP - Use Unicode properties for character classes. + * - HS_FLAG_PREFILTER - Compile pattern in prefiltering mode. + * - HS_FLAG_SOM_LEFTMOST - Report the leftmost start of match offset + * when a match is found. + * - HS_FLAG_COMBINATION - Parse the expression in logical combination + * syntax. + * - HS_FLAG_QUIET - Ignore match reporting for this expression. Used for + * the sub-expressions in logical combinations. + * + * @param ids + * An array of integers specifying the ID number to be associated with the + * corresponding pattern in the expressions array. Specifying the NULL + * pointer in place of an array will set the ID value for all patterns to + * zero. + * + * @param ext + * An array of pointers to filled @ref hs_expr_ext_t structures that + * define extended behaviour for each pattern. NULL may be specified if no + * extended behaviour is needed for an individual pattern, or in place of + * the whole array if it is not needed for any expressions. Memory used by + * these structures must be both allocated and freed by the caller. + * + * @param elements + * The number of elements in the input arrays. + * + * @param mode + * Compiler mode flags that affect the database as a whole. One of @ref + * HS_MODE_STREAM, @ref HS_MODE_BLOCK or @ref HS_MODE_VECTORED must be + * supplied, to select between the generation of a streaming, block or + * vectored database. In addition, other flags (beginning with HS_MODE_) + * may be supplied to enable specific features. See @ref HS_MODE_FLAG for + * more details. + * + * @param platform + * If not NULL, the platform structure is used to determine the target + * platform for the database. If NULL, a database suitable for running + * on the current host platform is produced. + * + * @param db + * On success, a pointer to the generated database will be returned in + * this parameter, or NULL on failure. The caller is responsible for + * deallocating the buffer using the @ref hs_free_database() function. + * + * @param error + * If the compile fails, a pointer to a @ref hs_compile_error_t will be + * returned, providing details of the error condition. The caller is + * responsible for deallocating the buffer using the @ref + * hs_free_compile_error() function. + * + * @return + * @ref HS_SUCCESS is returned on successful compilation; @ref + * HS_COMPILER_ERROR on failure, with details provided in the @p error + * parameter. + * + */ +hs_error_t HS_CDECL hs_compile_ext_multi(const char *const *expressions, + const unsigned int *flags, + const unsigned int *ids, + const hs_expr_ext_t *const *ext, + unsigned int elements, unsigned int mode, + const hs_platform_info_t *platform, + hs_database_t **db, hs_compile_error_t **error); + +/** + * The basic pure literal expression compiler. + * + * This is the function call with which a pure literal expression (not a + * common regular expression) is compiled into a Hyperscan database which + * can be passed to the runtime functions (such as @ref hs_scan(), + * @ref hs_open_stream(), etc.) + * + * @param expression + * The NULL-terminated expression to parse. 
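Before the literal compiler below, a sketch of hs_compile_ext_multi() with one extended-parameter structure: pattern 1 may only match within the first kilobyte of each stream, pattern 2 uses no extras. The pattern strings, IDs and the 1024-byte limit are made up for illustration.

#include <stdio.h>

#include "hs_compile.h"

static hs_database_t *build_constrained_db(void)
{
    const char *exprs[] = { "foo\\d+", "bar[a-z]{4}" };
    unsigned int flags[] = { HS_FLAG_CASELESS, 0 };
    unsigned int ids[]   = { 1, 2 };

    hs_expr_ext_t ext0 = {
        .flags      = HS_EXT_FLAG_MAX_OFFSET,
        .max_offset = 1024,
    };
    const hs_expr_ext_t *ext[] = { &ext0, NULL }; /* NULL: no extras for #2 */

    hs_database_t *db = NULL;
    hs_compile_error_t *err = NULL;

    if (hs_compile_ext_multi(exprs, flags, ids, ext, 2, HS_MODE_STREAM,
                             NULL, &db, &err) != HS_SUCCESS) {
        fprintf(stderr, "compile failed: %s\n", err->message);
        hs_free_compile_error(err);
        return NULL;
    }
    return db;
}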
Note that this string must + * represent ONLY the pattern to be matched, with no delimiters or flags; + * any global flags should be specified with the @p flags argument. For + * example, the expression `/abc?def/i` should be compiled by providing + * `abc?def` as the @p expression, and @ref HS_FLAG_CASELESS as the @a + * flags. Meanwhile, the string content shall be fully parsed in a literal + * sense without any regular grammars. For example, the @p expression + * `abc?` simply means a char sequence of `a`, `b`, `c`, and `?`. The `?` + * here doesn't mean 0 or 1 quantifier under regular semantics. + * + * @param flags + * Flags which modify the behaviour of the expression. Multiple flags may + * be used by ORing them together. Compared to @ref hs_compile(), fewer + * valid values are provided: + * - HS_FLAG_CASELESS - Matching will be performed case-insensitively. + * - HS_FLAG_SINGLEMATCH - Only one match will be generated for the + * expression per stream. + * - HS_FLAG_SOM_LEFTMOST - Report the leftmost start of match offset + * when a match is found. + * + * @param len + * The length of the text content of the pure literal expression. As the + * text content indicated by @p expression is treated as single character + * one by one, the special terminating character `\0` should be allowed + * to appear in expression, and not treated as a terminator for a string. + * Thus, the end of a pure literal expression cannot be indicated by + * identifying `\0`, but by counting to the expression length. + * + * @param mode + * Compiler mode flags that affect the database as a whole. One of @ref + * HS_MODE_STREAM or @ref HS_MODE_BLOCK or @ref HS_MODE_VECTORED must be + * supplied, to select between the generation of a streaming, block or + * vectored database. In addition, other flags (beginning with HS_MODE_) + * may be supplied to enable specific features. See @ref HS_MODE_FLAG for + * more details. + * + * @param platform + * If not NULL, the platform structure is used to determine the target + * platform for the database. If NULL, a database suitable for running + * on the current host platform is produced. + * + * @param db + * On success, a pointer to the generated database will be returned in + * this parameter, or NULL on failure. The caller is responsible for + * deallocating the buffer using the @ref hs_free_database() function. + * + * @param error + * If the compile fails, a pointer to a @ref hs_compile_error_t will be + * returned, providing details of the error condition. The caller is + * responsible for deallocating the buffer using the @ref + * hs_free_compile_error() function. + * + * @return + * @ref HS_SUCCESS is returned on successful compilation; @ref + * HS_COMPILER_ERROR on failure, with details provided in the error + * parameter. + */ +hs_error_t HS_CDECL hs_compile_lit(const char *expression, unsigned flags, + const size_t len, unsigned mode, + const hs_platform_info_t *platform, + hs_database_t **db, + hs_compile_error_t **error); +/** + * The multiple pure literal expression compiler. + * + * This is the function call with which a set of pure literal expressions is + * compiled into a database which can be passed to the runtime functions (such + * as @ref hs_scan(), @ref hs_open_stream(), etc.) Each expression can be + * labelled with a unique integer which is passed into the match callback to + * identify the pattern that has matched. + * + * @param expressions + * The NULL-terminated expression to parse. 
Note that this string must + * represent ONLY the pattern to be matched, with no delimiters or flags; + * any global flags should be specified with the @p flags argument. For + * example, the expression `/abc?def/i` should be compiled by providing + * `abc?def` as the @p expression, and @ref HS_FLAG_CASELESS as the @a + * flags. Meanwhile, the string content shall be fully parsed in a literal + * sense without any regular grammars. For example, the @p expression + * `abc?` simply means a char sequence of `a`, `b`, `c`, and `?`. The `?` + * here doesn't mean 0 or 1 quantifier under regular semantics. + * + * @param flags + * Array of flags which modify the behaviour of each expression. Multiple + * flags may be used by ORing them together. Specifying the NULL pointer + * in place of an array will set the flags value for all patterns to zero. + * Compared to @ref hs_compile_multi(), fewer valid values are provided: + * - HS_FLAG_CASELESS - Matching will be performed case-insensitively. + * - HS_FLAG_SINGLEMATCH - Only one match will be generated for the + * expression per stream. + * - HS_FLAG_SOM_LEFTMOST - Report the leftmost start of match offset + * when a match is found. + * + * @param ids + * An array of integers specifying the ID number to be associated with the + * corresponding pattern in the expressions array. Specifying the NULL + * pointer in place of an array will set the ID value for all patterns to + * zero. + * + * @param lens + * Array of lengths of the text content of each pure literal expression. + * As the text content indicated by @p expression is treated as single + * character one by one, the special terminating character `\0` should be + * allowed to appear in expression, and not treated as a terminator for a + * string. Thus, the end of a pure literal expression cannot be indicated + * by identifying `\0`, but by counting to the expression length. + * + * @param elements + * The number of elements in the input arrays. + * + * @param mode + * Compiler mode flags that affect the database as a whole. One of @ref + * HS_MODE_STREAM or @ref HS_MODE_BLOCK or @ref HS_MODE_VECTORED must be + * supplied, to select between the generation of a streaming, block or + * vectored database. In addition, other flags (beginning with HS_MODE_) + * may be supplied to enable specific features. See @ref HS_MODE_FLAG for + * more details. + * + * @param platform + * If not NULL, the platform structure is used to determine the target + * platform for the database. If NULL, a database suitable for running + * on the current host platform is produced. + * + * @param db + * On success, a pointer to the generated database will be returned in + * this parameter, or NULL on failure. The caller is responsible for + * deallocating the buffer using the @ref hs_free_database() function. + * + * @param error + * If the compile fails, a pointer to a @ref hs_compile_error_t will be + * returned, providing details of the error condition. The caller is + * responsible for deallocating the buffer using the @ref + * hs_free_compile_error() function. + * + * @return + * @ref HS_SUCCESS is returned on successful compilation; @ref + * HS_COMPILER_ERROR on failure, with details provided in the error + * parameter. 
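A sketch of the single-literal compiler described above: the `?` in "abc?" is treated as a plain byte, and the explicit length argument is what allows embedded NUL bytes in a literal. The function and variable names are illustrative.

#include <stdio.h>
#include <string.h>

#include "hs_compile.h"

static hs_database_t *build_literal_db(void)
{
    static const char lit[] = "abc?";
    hs_database_t *db = NULL;
    hs_compile_error_t *err = NULL;

    if (hs_compile_lit(lit, HS_FLAG_CASELESS, strlen(lit), HS_MODE_BLOCK,
                       NULL, &db, &err) != HS_SUCCESS) {
        fprintf(stderr, "literal compile failed: %s\n", err->message);
        hs_free_compile_error(err);
        return NULL;
    }
    return db;
}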
+ */ +hs_error_t HS_CDECL hs_compile_lit_multi(const char * const *expressions, + const unsigned *flags, + const unsigned *ids, + const size_t *lens, + unsigned elements, unsigned mode, + const hs_platform_info_t *platform, + hs_database_t **db, + hs_compile_error_t **error); + +/** + * Free an error structure generated by @ref hs_compile(), @ref + * hs_compile_multi() or @ref hs_compile_ext_multi(). + * + * @param error + * The @ref hs_compile_error_t to be freed. NULL may also be safely + * provided. + * + * @return + * @ref HS_SUCCESS on success, other values on failure. + */ +hs_error_t HS_CDECL hs_free_compile_error(hs_compile_error_t *error); + +/** + * Utility function providing information about a regular expression. The + * information provided in @ref hs_expr_info_t includes the minimum and maximum + * width of a pattern match. + * + * Note: successful analysis of an expression with this function does not imply + * that compilation of the same expression (via @ref hs_compile(), @ref + * hs_compile_multi() or @ref hs_compile_ext_multi()) would succeed. This + * function may return @ref HS_SUCCESS for regular expressions that Hyperscan + * cannot compile. + * + * Note: some per-pattern flags (such as @ref HS_FLAG_ALLOWEMPTY, @ref + * HS_FLAG_SOM_LEFTMOST) are accepted by this call, but as they do not affect + * the properties returned in the @ref hs_expr_info_t structure, they will not + * affect the outcome of this function. + * + * @param expression + * The NULL-terminated expression to parse. Note that this string must + * represent ONLY the pattern to be matched, with no delimiters or flags; + * any global flags should be specified with the @p flags argument. For + * example, the expression `/abc?def/i` should be compiled by providing + * `abc?def` as the @p expression, and @ref HS_FLAG_CASELESS as the @a + * flags. + * + * @param flags + * Flags which modify the behaviour of the expression. Multiple flags may + * be used by ORing them together. Valid values are: + * - HS_FLAG_CASELESS - Matching will be performed case-insensitively. + * - HS_FLAG_DOTALL - Matching a `.` will not exclude newlines. + * - HS_FLAG_MULTILINE - `^` and `$` anchors match any newlines in data. + * - HS_FLAG_SINGLEMATCH - Only one match will be generated by the + * expression per stream. + * - HS_FLAG_ALLOWEMPTY - Allow expressions which can match against an + * empty string, such as `.*`. + * - HS_FLAG_UTF8 - Treat this pattern as a sequence of UTF-8 characters. + * - HS_FLAG_UCP - Use Unicode properties for character classes. + * - HS_FLAG_PREFILTER - Compile pattern in prefiltering mode. + * - HS_FLAG_SOM_LEFTMOST - Report the leftmost start of match offset + * when a match is found. + * - HS_FLAG_COMBINATION - Parse the expression in logical combination + * syntax. + * - HS_FLAG_QUIET - Ignore match reporting for this expression. Used for + * the sub-expressions in logical combinations. + * + * @param info + * On success, a pointer to the pattern information will be returned in + * this parameter, or NULL on failure. This structure is allocated using + * the allocator supplied in @ref hs_set_allocator() (or malloc() if no + * allocator was set) and should be freed by the caller. + * + * @param error + * If the call fails, a pointer to a @ref hs_compile_error_t will be + * returned, providing details of the error condition. The caller is + * responsible for deallocating the buffer using the @ref + * hs_free_compile_error() function. 
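A small sketch of inspecting an hs_compile_error_t from one of the multi-pattern compile calls: the expression field described earlier in this header identifies the offending pattern when it is non-negative. The report_compile_error name is illustrative.

#include <stdio.h>

#include "hs_compile.h"

/* Report which pattern failed; expression < 0 means no specific pattern. */
static void report_compile_error(hs_compile_error_t *err)
{
    if (err->expression >= 0)
        fprintf(stderr, "pattern #%d: %s\n", err->expression, err->message);
    else
        fprintf(stderr, "compile error: %s\n", err->message);
    hs_free_compile_error(err);
}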
+ * + * @return + * @ref HS_SUCCESS is returned on successful compilation; @ref + * HS_COMPILER_ERROR on failure, with details provided in the error + * parameter. + */ +hs_error_t HS_CDECL hs_expression_info(const char *expression, + unsigned int flags, + hs_expr_info_t **info, + hs_compile_error_t **error); + +/** + * Utility function providing information about a regular expression, with + * extended parameter support. The information provided in @ref hs_expr_info_t + * includes the minimum and maximum width of a pattern match. + * + * Note: successful analysis of an expression with this function does not imply + * that compilation of the same expression (via @ref hs_compile(), @ref + * hs_compile_multi() or @ref hs_compile_ext_multi()) would succeed. This + * function may return @ref HS_SUCCESS for regular expressions that Hyperscan + * cannot compile. + * + * Note: some per-pattern flags (such as @ref HS_FLAG_ALLOWEMPTY, @ref + * HS_FLAG_SOM_LEFTMOST) are accepted by this call, but as they do not affect + * the properties returned in the @ref hs_expr_info_t structure, they will not + * affect the outcome of this function. + * + * @param expression + * The NULL-terminated expression to parse. Note that this string must + * represent ONLY the pattern to be matched, with no delimiters or flags; + * any global flags should be specified with the @p flags argument. For + * example, the expression `/abc?def/i` should be compiled by providing + * `abc?def` as the @p expression, and @ref HS_FLAG_CASELESS as the @a + * flags. + * + * @param flags + * Flags which modify the behaviour of the expression. Multiple flags may + * be used by ORing them together. Valid values are: + * - HS_FLAG_CASELESS - Matching will be performed case-insensitively. + * - HS_FLAG_DOTALL - Matching a `.` will not exclude newlines. + * - HS_FLAG_MULTILINE - `^` and `$` anchors match any newlines in data. + * - HS_FLAG_SINGLEMATCH - Only one match will be generated by the + * expression per stream. + * - HS_FLAG_ALLOWEMPTY - Allow expressions which can match against an + * empty string, such as `.*`. + * - HS_FLAG_UTF8 - Treat this pattern as a sequence of UTF-8 characters. + * - HS_FLAG_UCP - Use Unicode properties for character classes. + * - HS_FLAG_PREFILTER - Compile pattern in prefiltering mode. + * - HS_FLAG_SOM_LEFTMOST - Report the leftmost start of match offset + * when a match is found. + * - HS_FLAG_COMBINATION - Parse the expression in logical combination + * syntax. + * - HS_FLAG_QUIET - Ignore match reporting for this expression. Used for + * the sub-expressions in logical combinations. + * + * @param ext + * A pointer to a filled @ref hs_expr_ext_t structure that defines + * extended behaviour for this pattern. NULL may be specified if no + * extended parameters are needed. + * + * @param info + * On success, a pointer to the pattern information will be returned in + * this parameter, or NULL on failure. This structure is allocated using + * the allocator supplied in @ref hs_set_allocator() (or malloc() if no + * allocator was set) and should be freed by the caller. + * + * @param error + * If the call fails, a pointer to a @ref hs_compile_error_t will be + * returned, providing details of the error condition. The caller is + * responsible for deallocating the buffer using the @ref + * hs_free_compile_error() function. + * + * @return + * @ref HS_SUCCESS is returned on successful compilation; @ref + * HS_COMPILER_ERROR on failure, with details provided in the error + * parameter. 
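A sketch of querying match-width bounds with hs_expression_info() before committing to a full compile. Freeing the returned structure with free() assumes the default misc allocator is in use (otherwise the allocator installed via hs_set_misc_allocator() applies); the names are illustrative.

#include <stdio.h>
#include <stdlib.h>

#include "hs_compile.h"

static int print_match_widths(const char *pattern)
{
    hs_expr_info_t *info = NULL;
    hs_compile_error_t *err = NULL;

    if (hs_expression_info(pattern, HS_FLAG_DOTALL, &info, &err)
        != HS_SUCCESS) {
        fprintf(stderr, "analysis failed: %s\n", err->message);
        hs_free_compile_error(err);
        return -1;
    }
    printf("min width %u, max width %u\n", info->min_width, info->max_width);
    free(info); /* misc allocator; malloc()/free() by default */
    return 0;
}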
+ */ +hs_error_t HS_CDECL hs_expression_ext_info(const char *expression, + unsigned int flags, + const hs_expr_ext_t *ext, + hs_expr_info_t **info, + hs_compile_error_t **error); + +/** + * Populates the platform information based on the current host. + * + * @param platform + * On success, the pointed to structure is populated based on the current + * host. + * + * @return + * @ref HS_SUCCESS on success, other values on failure. + */ +hs_error_t HS_CDECL hs_populate_platform(hs_platform_info_t *platform); + +/** + * @defgroup HS_PATTERN_FLAG Pattern flags + * + * @{ + */ + +/** + * Compile flag: Set case-insensitive matching. + * + * This flag sets the expression to be matched case-insensitively by default. + * The expression may still use PCRE tokens (notably `(?i)` and + * `(?-i)`) to switch case-insensitive matching on and off. + */ +#define HS_FLAG_CASELESS 1 + +/** + * Compile flag: Matching a `.` will not exclude newlines. + * + * This flag sets any instances of the `.` token to match newline characters as + * well as all other characters. The PCRE specification states that the `.` + * token does not match newline characters by default, so without this flag the + * `.` token will not cross line boundaries. + */ +#define HS_FLAG_DOTALL 2 + +/** + * Compile flag: Set multi-line anchoring. + * + * This flag instructs the expression to make the `^` and `$` tokens match + * newline characters as well as the start and end of the stream. If this flag + * is not specified, the `^` token will only ever match at the start of a + * stream, and the `$` token will only ever match at the end of a stream within + * the guidelines of the PCRE specification. + */ +#define HS_FLAG_MULTILINE 4 + +/** + * Compile flag: Set single-match only mode. + * + * This flag sets the expression's match ID to match at most once. In streaming + * mode, this means that the expression will return only a single match over + * the lifetime of the stream, rather than reporting every match as per + * standard Hyperscan semantics. In block mode or vectored mode, only the first + * match for each invocation of @ref hs_scan() or @ref hs_scan_vector() will be + * returned. + * + * If multiple expressions in the database share the same match ID, then they + * either must all specify @ref HS_FLAG_SINGLEMATCH or none of them specify + * @ref HS_FLAG_SINGLEMATCH. If a group of expressions sharing a match ID + * specify the flag, then at most one match with the match ID will be generated + * per stream. + * + * Note: The use of this flag in combination with @ref HS_FLAG_SOM_LEFTMOST + * is not currently supported. + */ +#define HS_FLAG_SINGLEMATCH 8 + +/** + * Compile flag: Allow expressions that can match against empty buffers. + * + * This flag instructs the compiler to allow expressions that can match against + * empty buffers, such as `.?`, `.*`, `(a|)`. Since Hyperscan can return every + * possible match for an expression, such expressions generally execute very + * slowly; the default behaviour is to return an error when an attempt to + * compile one is made. Using this flag will force the compiler to allow such + * an expression. + */ +#define HS_FLAG_ALLOWEMPTY 16 + +/** + * Compile flag: Enable UTF-8 mode for this expression. + * + * This flag instructs Hyperscan to treat the pattern as a sequence of UTF-8 + * characters. The results of scanning invalid UTF-8 sequences with a Hyperscan + * library that has been compiled with one or more patterns using this flag are + * undefined. 
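A sketch of ORing the pattern flags in this group into a per-pattern flags array for hs_compile_multi(); the patterns, IDs and flag choices are arbitrary examples.

#include "hs_compile.h"

/* Flags are ORed per pattern; ids[] are echoed back to the match callback. */
static hs_database_t *build_block_db(hs_compile_error_t **err)
{
    const char *exprs[] = { "fo+bar", "^GET\\s", "evil\\.example" };
    unsigned int flags[] = {
        HS_FLAG_CASELESS | HS_FLAG_DOTALL,
        HS_FLAG_MULTILINE,
        HS_FLAG_SINGLEMATCH,
    };
    unsigned int ids[] = { 10, 20, 30 };
    hs_database_t *db = NULL;

    if (hs_compile_multi(exprs, flags, ids, 3, HS_MODE_BLOCK,
                         NULL, &db, err) != HS_SUCCESS)
        return NULL; /* caller inspects and frees *err */
    return db;
}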
+ */ +#define HS_FLAG_UTF8 32 + +/** + * Compile flag: Enable Unicode property support for this expression. + * + * This flag instructs Hyperscan to use Unicode properties, rather than the + * default ASCII interpretations, for character mnemonics like `\w` and `\s` as + * well as the POSIX character classes. It is only meaningful in conjunction + * with @ref HS_FLAG_UTF8. + */ +#define HS_FLAG_UCP 64 + +/** + * Compile flag: Enable prefiltering mode for this expression. + * + * This flag instructs Hyperscan to compile an "approximate" version of this + * pattern for use in a prefiltering application, even if Hyperscan does not + * support the pattern in normal operation. + * + * The set of matches returned when this flag is used is guaranteed to be a + * superset of the matches specified by the non-prefiltering expression. + * + * If the pattern contains pattern constructs not supported by Hyperscan (such + * as zero-width assertions, back-references or conditional references) these + * constructs will be replaced internally with broader constructs that may + * match more often. + * + * Furthermore, in prefiltering mode Hyperscan may simplify a pattern that + * would otherwise return a "Pattern too large" error at compile time, or for + * performance reasons (subject to the matching guarantee above). + * + * It is generally expected that the application will subsequently confirm + * prefilter matches with another regular expression matcher that can provide + * exact matches for the pattern. + * + * Note: The use of this flag in combination with @ref HS_FLAG_SOM_LEFTMOST + * is not currently supported. + */ +#define HS_FLAG_PREFILTER 128 + +/** + * Compile flag: Enable leftmost start of match reporting. + * + * This flag instructs Hyperscan to report the leftmost possible start of match + * offset when a match is reported for this expression. (By default, no start + * of match is returned.) + * + * For all the 3 modes, enabling this behaviour may reduce performance. And + * particularly, it may increase stream state requirements in streaming mode. + */ +#define HS_FLAG_SOM_LEFTMOST 256 + +/** + * Compile flag: Logical combination. + * + * This flag instructs Hyperscan to parse this expression as logical + * combination syntax. + * Logical constraints consist of operands, operators and parentheses. + * The operands are expression indices, and operators can be + * '!'(NOT), '&'(AND) or '|'(OR). + * For example: + * (101&102&103)|(104&!105) + * ((301|302)&303)&(304|305) + */ +#define HS_FLAG_COMBINATION 512 + +/** + * Compile flag: Don't do any match reporting. + * + * This flag instructs Hyperscan to ignore match reporting for this expression. + * It is designed to be used on the sub-expressions in logical combinations. + */ +#define HS_FLAG_QUIET 1024 + +/** @} */ + +/** + * @defgroup HS_CPU_FEATURES_FLAG CPU feature support flags + * + * @{ + */ + +/** + * CPU features flag - Intel(R) Advanced Vector Extensions 2 (Intel(R) AVX2) + * + * Setting this flag indicates that the target platform supports AVX2 + * instructions. + */ +#define HS_CPU_FEATURES_AVX2 (1ULL << 2) + +/** + * CPU features flag - Intel(R) Advanced Vector Extensions 512 (Intel(R) AVX512) + * + * Setting this flag indicates that the target platform supports AVX512 + * instructions, specifically AVX-512BW. Using AVX512 implies the use of AVX2. 
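A sketch of the logical-combination flags defined above: two sub-patterns are compiled with HS_FLAG_QUIET so they never report on their own, and a third expression combines them, assuming (as the ID-style operands in the example above suggest) that the operand numbers name the sub-expressions' IDs. All patterns and IDs are illustrative.

#include "hs_compile.h"

/* Only the combination "101 & !102" reports, under id 900. */
static hs_error_t build_combination_db(hs_database_t **db,
                                       hs_compile_error_t **err)
{
    const char *exprs[] = { "attack[0-9]+", "allowlisted-host", "101&!102" };
    unsigned int flags[] = { HS_FLAG_QUIET, HS_FLAG_QUIET,
                             HS_FLAG_COMBINATION };
    unsigned int ids[] = { 101, 102, 900 };

    return hs_compile_multi(exprs, flags, ids, 3, HS_MODE_BLOCK,
                            NULL, db, err);
}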
+ */ +#define HS_CPU_FEATURES_AVX512 (1ULL << 3) + +/** + * CPU features flag - Intel(R) Advanced Vector Extensions 512 + * Vector Byte Manipulation Instructions (Intel(R) AVX512VBMI) + * + * Setting this flag indicates that the target platform supports AVX512VBMI + * instructions. Using AVX512VBMI implies the use of AVX512. + */ +#define HS_CPU_FEATURES_AVX512VBMI (1ULL << 4) + +/** @} */ + +/** + * @defgroup HS_TUNE_FLAG Tuning flags + * + * @{ + */ + +/** + * Tuning Parameter - Generic + * + * This indicates that the compiled database should not be tuned for any + * particular target platform. + */ +#define HS_TUNE_FAMILY_GENERIC 0 + +/** + * Tuning Parameter - Intel(R) microarchitecture code name Sandy Bridge + * + * This indicates that the compiled database should be tuned for the + * Sandy Bridge microarchitecture. + */ +#define HS_TUNE_FAMILY_SNB 1 + +/** + * Tuning Parameter - Intel(R) microarchitecture code name Ivy Bridge + * + * This indicates that the compiled database should be tuned for the + * Ivy Bridge microarchitecture. + */ +#define HS_TUNE_FAMILY_IVB 2 + +/** + * Tuning Parameter - Intel(R) microarchitecture code name Haswell + * + * This indicates that the compiled database should be tuned for the + * Haswell microarchitecture. + */ +#define HS_TUNE_FAMILY_HSW 3 + +/** + * Tuning Parameter - Intel(R) microarchitecture code name Silvermont + * + * This indicates that the compiled database should be tuned for the + * Silvermont microarchitecture. + */ +#define HS_TUNE_FAMILY_SLM 4 + +/** + * Tuning Parameter - Intel(R) microarchitecture code name Broadwell + * + * This indicates that the compiled database should be tuned for the + * Broadwell microarchitecture. + */ +#define HS_TUNE_FAMILY_BDW 5 + +/** + * Tuning Parameter - Intel(R) microarchitecture code name Skylake + * + * This indicates that the compiled database should be tuned for the + * Skylake microarchitecture. + */ +#define HS_TUNE_FAMILY_SKL 6 + +/** + * Tuning Parameter - Intel(R) microarchitecture code name Skylake Server + * + * This indicates that the compiled database should be tuned for the + * Skylake Server microarchitecture. + */ +#define HS_TUNE_FAMILY_SKX 7 + +/** + * Tuning Parameter - Intel(R) microarchitecture code name Goldmont + * + * This indicates that the compiled database should be tuned for the + * Goldmont microarchitecture. + */ +#define HS_TUNE_FAMILY_GLM 8 + +/** + * Tuning Parameter - Intel(R) microarchitecture code name Icelake + * + * This indicates that the compiled database should be tuned for the + * Icelake microarchitecture. + */ +#define HS_TUNE_FAMILY_ICL 9 + +/** + * Tuning Parameter - Intel(R) microarchitecture code name Icelake Server + * + * This indicates that the compiled database should be tuned for the + * Icelake Server microarchitecture. + */ +#define HS_TUNE_FAMILY_ICX 10 + +/** @} */ + +/** + * @defgroup HS_MODE_FLAG Compile mode flags + * + * The mode flags are used as values for the mode parameter of the various + * compile calls (@ref hs_compile(), @ref hs_compile_multi() and @ref + * hs_compile_ext_multi()). + * + * A mode value can be built by ORing these flag values together; the only + * required flag is one of @ref HS_MODE_BLOCK, @ref HS_MODE_STREAM or @ref + * HS_MODE_VECTORED. Other flags may be added to enable support for additional + * features. + * + * @{ + */ + +/** + * Compiler mode flag: Block scan (non-streaming) database. + */ +#define HS_MODE_BLOCK 1 + +/** + * Compiler mode flag: Alias for @ref HS_MODE_BLOCK. 
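A sketch of targeting a specific microarchitecture with the tuning and CPU-feature values above rather than the build host; the pattern and function name are illustrative. The tune field only guides optimisation, while setting HS_CPU_FEATURES_AVX2 makes the resulting database require AVX2 at run time.

#include <string.h>

#include "hs_compile.h"

static hs_error_t build_for_haswell(const char *pattern, hs_database_t **db,
                                    hs_compile_error_t **err)
{
    hs_platform_info_t plat;

    memset(&plat, 0, sizeof(plat));
    plat.tune = HS_TUNE_FAMILY_HSW;
    plat.cpu_features = HS_CPU_FEATURES_AVX2;

    return hs_compile(pattern, 0, HS_MODE_BLOCK, &plat, db, err);
}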
+ */ +#define HS_MODE_NOSTREAM 1 + +/** + * Compiler mode flag: Streaming database. + */ +#define HS_MODE_STREAM 2 + +/** + * Compiler mode flag: Vectored scanning database. + */ +#define HS_MODE_VECTORED 4 + +/** + * Compiler mode flag: use full precision to track start of match offsets in + * stream state. + * + * This mode will use the most stream state per pattern, but will always return + * an accurate start of match offset regardless of how far back in the past it + * was found. + * + * One of the SOM_HORIZON modes must be selected to use the @ref + * HS_FLAG_SOM_LEFTMOST expression flag. + */ +#define HS_MODE_SOM_HORIZON_LARGE (1U << 24) + +/** + * Compiler mode flag: use medium precision to track start of match offsets in + * stream state. + * + * This mode will use less stream state than @ref HS_MODE_SOM_HORIZON_LARGE and + * will limit start of match accuracy to offsets within 2^32 bytes of the + * end of match offset reported. + * + * One of the SOM_HORIZON modes must be selected to use the @ref + * HS_FLAG_SOM_LEFTMOST expression flag. + */ +#define HS_MODE_SOM_HORIZON_MEDIUM (1U << 25) + +/** + * Compiler mode flag: use limited precision to track start of match offsets in + * stream state. + * + * This mode will use less stream state than @ref HS_MODE_SOM_HORIZON_LARGE and + * will limit start of match accuracy to offsets within 2^16 bytes of the + * end of match offset reported. + * + * One of the SOM_HORIZON modes must be selected to use the @ref + * HS_FLAG_SOM_LEFTMOST expression flag. + */ +#define HS_MODE_SOM_HORIZON_SMALL (1U << 26) + +/** @} */ + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif /* HS_COMPILE_H_ */ diff --git a/regex/hs_internal.h b/regex/hs_internal.h new file mode 100644 index 000000000..adf07b22c --- /dev/null +++ b/regex/hs_internal.h @@ -0,0 +1,89 @@ +/* + * Copyright (c) 2019, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Internal-use only definitions. Available to internal tools. 
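A sketch of combining the mode bits above: streaming mode plus a small start-of-match horizon, which is required for HS_FLAG_SOM_LEFTMOST and keeps stream state compact at the cost of only tracking starts of match within 2^16 bytes of the match end. Names are illustrative.

#include "hs_compile.h"

static hs_error_t build_som_stream_db(const char *pattern, hs_database_t **db,
                                      hs_compile_error_t **err)
{
    unsigned int mode = HS_MODE_STREAM | HS_MODE_SOM_HORIZON_SMALL;

    return hs_compile(pattern, HS_FLAG_SOM_LEFTMOST, mode, NULL, db, err);
}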
+ */ + +#ifndef HS_INTERNAL_H +#define HS_INTERNAL_H + +#include "ue2common.h" +#include "hs.h" + +#ifdef __cplusplus + +namespace ue2 { + +struct Grey; + +/** \brief Internal use only: takes a Grey argument so that we can use it in + * tools. */ +hs_error_t hs_compile_multi_int(const char *const *expressions, + const unsigned *flags, const unsigned *ids, + const hs_expr_ext *const *ext, + unsigned elements, unsigned mode, + const hs_platform_info_t *platform, + hs_database_t **db, + hs_compile_error_t **comp_error, const Grey &g); + +/** \brief Internal use only: takes a Grey argument so that we can use it in + * tools. */ +hs_error_t hs_compile_lit_multi_int(const char *const *expressions, + const unsigned *flags, const unsigned *ids, + const hs_expr_ext *const *ext, + const size_t *lens, unsigned elements, + unsigned mode, + const hs_platform_info_t *platform, + hs_database_t **db, + hs_compile_error_t **comp_error, + const Grey &g); +} // namespace ue2 + +extern "C" +{ +#endif + +#define HS_MATCH_FLAG_ADJUSTED 1U + +/** \brief Bitmask of all valid Hyperscan flags. */ +#define HS_FLAG_ALL ( HS_FLAG_CASELESS \ + | HS_FLAG_DOTALL \ + | HS_FLAG_MULTILINE \ + | HS_FLAG_UTF8 \ + | HS_FLAG_UCP \ + | HS_FLAG_PREFILTER \ + | HS_FLAG_SINGLEMATCH \ + | HS_FLAG_ALLOWEMPTY \ + | HS_FLAG_SOM_LEFTMOST) + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif diff --git a/regex/hs_runtime.h b/regex/hs_runtime.h new file mode 100644 index 000000000..c757aa2c9 --- /dev/null +++ b/regex/hs_runtime.h @@ -0,0 +1,683 @@ +/* + * Copyright (c) 2015-2018, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef HS_RUNTIME_H_ +#define HS_RUNTIME_H_ + +#ifndef __KERNEL__ +#include +#else +#include +#endif + +/** + * @file + * @brief The Hyperscan runtime API definition. + * + * Hyperscan is a high speed regular expression engine. + * + * This header contains functions for using compiled Hyperscan databases for + * scanning data at runtime. 
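As an end-to-end userspace-style sketch of the runtime flow described here: compile a pattern, allocate scratch, scan one buffer in block mode, and clean up. hs_scan(), hs_alloc_scratch(), hs_free_scratch() and hs_free_database() are assumed from the parts of the Hyperscan API that are not in this hunk; all other names are illustrative.

#include <stdio.h>

#include "hs.h"

static int on_match(unsigned int id, unsigned long long from,
                    unsigned long long to, unsigned int flags, void *ctx)
{
    (void)from; (void)flags; (void)ctx;
    printf("pattern %u matched, ending at offset %llu\n", id, to);
    return 0; /* zero: keep scanning */
}

/* Compile, scan a single buffer in block mode, then release everything. */
static int scan_once(const char *pattern, const char *buf, unsigned int len)
{
    hs_database_t *db = NULL;
    hs_compile_error_t *cerr = NULL;
    hs_scratch_t *scratch = NULL;
    int ret = -1;

    if (hs_compile(pattern, HS_FLAG_DOTALL, HS_MODE_BLOCK, NULL, &db,
                   &cerr) != HS_SUCCESS) {
        fprintf(stderr, "compile failed: %s\n", cerr->message);
        hs_free_compile_error(cerr);
        return -1;
    }
    if (hs_alloc_scratch(db, &scratch) != HS_SUCCESS)
        goto out;
    if (hs_scan(db, buf, len, 0, scratch, on_match, NULL) == HS_SUCCESS)
        ret = 0;
out:
    hs_free_scratch(scratch);
    hs_free_database(db);
    return ret;
}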
+ */ + +#include "hs_common.h" +//#include "fw/str.h" + +#ifdef __cplusplus +extern "C" +{ +#endif + +/** + * Definition of the stream identifier type. + */ +struct hs_stream; + +/** + * The stream identifier returned by @ref hs_open_stream(). + */ +typedef struct hs_stream hs_stream_t; + +struct hs_scratch; + +/** + * A Hyperscan scratch space. + */ +typedef struct hs_scratch hs_scratch_t; + +/** + * Definition of the match event callback function type. + * + * A callback function matching the defined type must be provided by the + * application calling the @ref hs_scan(), @ref hs_scan_vector() or @ref + * hs_scan_stream() functions (or other streaming calls which can produce + * matches). + * + * This callback function will be invoked whenever a match is located in the + * target data during the execution of a scan. The details of the match are + * passed in as parameters to the callback function, and the callback function + * should return a value indicating whether or not matching should continue on + * the target data. If no callbacks are desired from a scan call, NULL may be + * provided in order to suppress match production. + * + * This callback function should not attempt to call Hyperscan API functions on + * the same stream nor should it attempt to reuse the scratch space allocated + * for the API calls that caused it to be triggered. Making another call to the + * Hyperscan library with completely independent parameters should work (for + * example, scanning a different database in a new stream and with new scratch + * space), but reusing data structures like stream state and/or scratch space + * will produce undefined behavior. + * + * @param id + * The ID number of the expression that matched. If the expression was a + * single expression compiled with @ref hs_compile(), this value will be + * zero. + * + * @param from + * - If a start of match flag is enabled for the current pattern, this + * argument will be set to the start of match for the pattern assuming + * that that start of match value lies within the current 'start of match + * horizon' chosen by one of the SOM_HORIZON mode flags. + + * - If the start of match value lies outside this horizon (possible only + * when the SOM_HORIZON value is not @ref HS_MODE_SOM_HORIZON_LARGE), + * the @p from value will be set to @ref HS_OFFSET_PAST_HORIZON. + + * - This argument will be set to zero if the Start of Match flag is not + * enabled for the given pattern. + * + * @param to + * The offset after the last byte that matches the expression. + * + * @param flags + * This is provided for future use and is unused at present. + * + * @param context + * The pointer supplied by the user to the @ref hs_scan(), @ref + * hs_scan_vector() or @ref hs_scan_stream() function. + * + * @return + * Non-zero if the matching should cease, else zero. If scanning is + * performed in streaming mode and a non-zero value is returned, any + * subsequent calls to @ref hs_scan_stream() for that stream will + * immediately return with @ref HS_SCAN_TERMINATED. + */ +typedef int (HS_CDECL *match_event_handler)(unsigned int id, + unsigned long long from, + unsigned long long to, + unsigned int flags, + void *context); + +/** + * Open and initialise a stream. + * + * @param db + * A compiled pattern database. + * + * @param flags + * Flags modifying the behaviour of the stream. This parameter is provided + * for future use and is unused at present. 
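A sketch of a match_event_handler that stops scanning at the first match; the result slot passed through the context pointer and the function name are illustrative. Once the callback returns non-zero, further writes to the same stream return HS_SCAN_TERMINATED.

#include "hs_runtime.h"

/* Record the end offset of the first match, then ask the engine to stop. */
static int stop_on_first_match(unsigned int id, unsigned long long from,
                               unsigned long long to, unsigned int flags,
                               void *ctx)
{
    unsigned long long *first_end = ctx; /* caller-provided result slot */

    (void)id; (void)from; (void)flags;
    *first_end = to;
    return 1; /* non-zero: cease matching */
}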
+ * + * @param stream + * On success, a pointer to the generated @ref hs_stream_t will be + * returned; NULL on failure. + * + * @return + * @ref HS_SUCCESS on success, other values on failure. + */ +hs_error_t HS_CDECL hs_open_stream(const hs_database_t *db, unsigned int flags, + hs_stream_t **stream); + +/** + * Write data to be scanned to the opened stream. + * + * This is the function call in which the actual pattern matching takes place + * as data is written to the stream. Matches will be returned via the @ref + * match_event_handler callback supplied. + * + * @param id + * The stream ID (returned by @ref hs_open_stream()) to which the data + * will be written. + * + * @param data + * Pointer to the data to be scanned. + * + * @param length + * The number of bytes to scan. + * + * @param flags + * Flags modifying the behaviour of the stream. This parameter is provided + * for future use and is unused at present. + * + * @param scratch + * A per-thread scratch space allocated by @ref hs_alloc_scratch(). + * + * @param onEvent + * Pointer to a match event callback function. If a NULL pointer is given, + * no matches will be returned. + * + * @param ctxt + * The user defined pointer which will be passed to the callback function + * when a match occurs. + * + * @return + * Returns @ref HS_SUCCESS on success; @ref HS_SCAN_TERMINATED if the + * match callback indicated that scanning should stop; other values on + * error. + */ +hs_error_t HS_CDECL hs_scan_stream(hs_stream_t *id, const char *data, + unsigned int length, unsigned int flags, + hs_scratch_t *scratch, + match_event_handler onEvent, void *ctxt); + +/** + * Close a stream. + * + * This function completes matching on the given stream and frees the memory + * associated with the stream state. After this call, the stream pointed to by + * @p id is invalid and can no longer be used. To reuse the stream state after + * completion, rather than closing it, the @ref hs_reset_stream function can be + * used. + * + * This function must be called for any stream created with @ref + * hs_open_stream(), even if scanning has been terminated by a non-zero return + * from the match callback function. + * + * Note: This operation may result in matches being returned (via calls to the + * match event callback) for expressions anchored to the end of the data stream + * (for example, via the use of the `$` meta-character). If these matches are + * not desired, NULL may be provided as the @ref match_event_handler callback. + * + * If NULL is provided as the @ref match_event_handler callback, it is + * permissible to provide a NULL scratch. + * + * @param id + * The stream ID returned by @ref hs_open_stream(). + * + * @param scratch + * A per-thread scratch space allocated by @ref hs_alloc_scratch(). This is + * allowed to be NULL only if the @p onEvent callback is also NULL. + * + * @param onEvent + * Pointer to a match event callback function. If a NULL pointer is given, + * no matches will be returned. + * + * @param ctxt + * The user defined pointer which will be passed to the callback function + * when a match occurs. + * + * @return + * Returns @ref HS_SUCCESS on success, other values on failure. + */ +hs_error_t HS_CDECL hs_close_stream(hs_stream_t *id, hs_scratch_t *scratch, + match_event_handler onEvent, void *ctxt); + +/** + * Reset a stream to an initial state. + * + * Conceptually, this is equivalent to performing @ref hs_close_stream() on the + * given stream, followed by a @ref hs_open_stream(). 
This new stream replaces + * the original stream in memory, avoiding the overhead of freeing the old + * stream and allocating the new one. + * + * Note: This operation may result in matches being returned (via calls to the + * match event callback) for expressions anchored to the end of the original + * data stream (for example, via the use of the `$` meta-character). If these + * matches are not desired, NULL may be provided as the @ref match_event_handler + * callback. + * + * Note: the stream will also be tied to the same database. + * + * @param id + * The stream (as created by @ref hs_open_stream()) to be replaced. + * + * @param flags + * Flags modifying the behaviour of the stream. This parameter is provided + * for future use and is unused at present. + * + * @param scratch + * A per-thread scratch space allocated by @ref hs_alloc_scratch(). This is + * allowed to be NULL only if the @p onEvent callback is also NULL. + * + * @param onEvent + * Pointer to a match event callback function. If a NULL pointer is given, + * no matches will be returned. + * + * @param context + * The user defined pointer which will be passed to the callback function + * when a match occurs. + * + * @return + * @ref HS_SUCCESS on success, other values on failure. + */ +hs_error_t HS_CDECL hs_reset_stream(hs_stream_t *id, unsigned int flags, + hs_scratch_t *scratch, + match_event_handler onEvent, void *context); + +/** + * Duplicate the given stream. The new stream will have the same state as the + * original including the current stream offset. + * + * @param to_id + * On success, a pointer to the new, copied @ref hs_stream_t will be + * returned; NULL on failure. + * + * @param from_id + * The stream (as created by @ref hs_open_stream()) to be copied. + * + * @return + * @ref HS_SUCCESS on success, other values on failure. + */ +hs_error_t HS_CDECL hs_copy_stream(hs_stream_t **to_id, + const hs_stream_t *from_id); + +/** + * Duplicate the given 'from' stream state onto the 'to' stream. The 'to' stream + * will first be reset (reporting any EOD matches if a non-NULL @p onEvent + * callback handler is provided). + * + * Note: the 'to' stream and the 'from' stream must be open against the same + * database. + * + * @param to_id + * On success, a pointer to the new, copied @ref hs_stream_t will be + * returned; NULL on failure. + * + * @param from_id + * The stream (as created by @ref hs_open_stream()) to be copied. + * + * @param scratch + * A per-thread scratch space allocated by @ref hs_alloc_scratch(). This is + * allowed to be NULL only if the @p onEvent callback is also NULL. + * + * @param onEvent + * Pointer to a match event callback function. If a NULL pointer is given, + * no matches will be returned. + * + * @param context + * The user defined pointer which will be passed to the callback function + * when a match occurs. + * + * @return + * @ref HS_SUCCESS on success, other values on failure. + */ +hs_error_t HS_CDECL hs_reset_and_copy_stream(hs_stream_t *to_id, + const hs_stream_t *from_id, + hs_scratch_t *scratch, + match_event_handler onEvent, + void *context); + +/** + * Creates a compressed representation of the provided stream in the buffer + * provided. This compressed representation can be converted back into a stream + * state by using @ref hs_expand_stream() or @ref hs_reset_and_expand_stream(). + * The size of the compressed representation will be placed into @p used_space. 
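+ *
+ * As a sketch of the usual two-step pattern (the size query relies on the
+ * NULL/zero-length form described for @p buf below; allocation and error
+ * handling are left to the caller):
+ *
+ *     size_t need = 0;
+ *     hs_compress_stream(stream, NULL, 0, &need); // HS_INSUFFICIENT_SPACE
+ *     // ... allocate a buffer of 'need' bytes ...
+ *     hs_compress_stream(stream, buf, need, &need);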
+ * + * If there is not sufficient space in the buffer to hold the compressed + * representation, @ref HS_INSUFFICIENT_SPACE will be returned and @p used_space + * will be populated with the amount of space required. + * + * Note: this function does not close the provided stream, you may continue to + * use the stream or to free it with @ref hs_close_stream(). + * + * @param stream + * The stream (as created by @ref hs_open_stream()) to be compressed. + * + * @param buf + * Buffer to write the compressed representation into. Note: if the call is + * just being used to determine the amount of space required, it is allowed + * to pass NULL here and @p buf_space as 0. + * + * @param buf_space + * The number of bytes in @p buf. If buf_space is too small, the call will + * fail with @ref HS_INSUFFICIENT_SPACE. + * + * @param used_space + * Pointer to where the amount of used space will be written to. The used + * buffer space is always less than or equal to @p buf_space. If the call + * fails with @ref HS_INSUFFICIENT_SPACE, this pointer will be used to + * write out the amount of buffer space required. + * + * @return + * @ref HS_SUCCESS on success, @ref HS_INSUFFICIENT_SPACE if the provided + * buffer is too small. + */ +hs_error_t HS_CDECL hs_compress_stream(const hs_stream_t *stream, char *buf, + size_t buf_space, size_t *used_space); + +/** + * Decompresses a compressed representation created by @ref hs_compress_stream() + * into a new stream. + * + * Note: @p buf must correspond to a complete compressed representation created + * by @ref hs_compress_stream() of a stream that was opened against @p db. It is + * not always possible to detect misuse of this API and behaviour is undefined + * if these properties are not satisfied. + * + * @param db + * The compiled pattern database that the compressed stream was opened + * against. + * + * @param stream + * On success, a pointer to the expanded @ref hs_stream_t will be + * returned; NULL on failure. + * + * @param buf + * A compressed representation of a stream. These compressed forms are + * created by @ref hs_compress_stream(). + * + * @param buf_size + * The size in bytes of the compressed representation. + * + * @return + * @ref HS_SUCCESS on success, other values on failure. + */ +hs_error_t HS_CDECL hs_expand_stream(const hs_database_t *db, + hs_stream_t **stream, const char *buf, + size_t buf_size); + +/** + * Decompresses a compressed representation created by @ref hs_compress_stream() + * on top of the 'to' stream. The 'to' stream will first be reset (reporting + * any EOD matches if a non-NULL @p onEvent callback handler is provided). + * + * Note: the 'to' stream must be opened against the same database as the + * compressed stream. + * + * Note: @p buf must correspond to a complete compressed representation created + * by @ref hs_compress_stream() of a stream that was opened against @p db. It is + * not always possible to detect misuse of this API and behaviour is undefined + * if these properties are not satisfied. + * + * @param to_stream + * A pointer to a valid stream state. A pointer to the expanded @ref + * hs_stream_t will be returned; NULL on failure. + * + * @param buf + * A compressed representation of a stream. These compressed forms are + * created by @ref hs_compress_stream(). + * + * @param buf_size + * The size in bytes of the compressed representation. + * + * @param scratch + * A per-thread scratch space allocated by @ref hs_alloc_scratch(). 
This is
+ *      allowed to be NULL only if the @p onEvent callback is also NULL.
+ *
+ * @param onEvent
+ *      Pointer to a match event callback function. If a NULL pointer is given,
+ *      no matches will be returned.
+ *
+ * @param context
+ *      The user defined pointer which will be passed to the callback function
+ *      when a match occurs.
+ *
+ * @return
+ *      @ref HS_SUCCESS on success, other values on failure.
+ */
+hs_error_t HS_CDECL hs_reset_and_expand_stream(hs_stream_t *to_stream,
+                                               const char *buf, size_t buf_size,
+                                               hs_scratch_t *scratch,
+                                               match_event_handler onEvent,
+                                               void *context);
+
+/**
+ * The block (non-streaming) regular expression scanner.
+ *
+ * This is the function call in which the actual pattern matching takes place
+ * for block-mode pattern databases.
+ *
+ * @param db
+ *      A compiled pattern database.
+ *
+ * @param data
+ *      Pointer to the data to be scanned.
+ *
+ * @param length
+ *      The number of bytes to scan.
+ *
+ * @param flags
+ *      Flags modifying the behaviour of this function. This parameter is
+ *      provided for future use and is unused at present.
+ *
+ * @param scratch
+ *      A per-thread scratch space allocated by @ref hs_alloc_scratch() for this
+ *      database.
+ *
+ * @param onEvent
+ *      Pointer to a match event callback function. If a NULL pointer is given,
+ *      no matches will be returned.
+ *
+ * @param context
+ *      The user defined pointer which will be passed to the callback function.
+ *
+ * @return
+ *      Returns @ref HS_SUCCESS on success; @ref HS_SCAN_TERMINATED if the
+ *      match callback indicated that scanning should stop; other values on
+ *      error.
+ */
+hs_error_t HS_CDECL hs_scan(const hs_database_t *db, const char *data,
+                            unsigned int length, unsigned int flags,
+                            hs_scratch_t *scratch, match_event_handler onEvent,
+                            void *context);
+
+/**
+ * The vectored regular expression scanner.
+ *
+ * This is the function call in which the actual pattern matching takes place
+ * for vectoring-mode pattern databases.
+ *
+ * @param db
+ *      A compiled pattern database.
+ *
+ * @param data
+ *      An array of pointers to the data blocks to be scanned.
+ *
+ * @param length
+ *      An array of lengths (in bytes) of each data block to scan.
+ *
+ * @param count
+ *      Number of data blocks to scan. This should correspond to the size of
+ *      the @p data and @p length arrays.
+ *
+ * @param flags
+ *      Flags modifying the behaviour of this function. This parameter is
+ *      provided for future use and is unused at present.
+ *
+ * @param scratch
+ *      A per-thread scratch space allocated by @ref hs_alloc_scratch() for
+ *      this database.
+ *
+ * @param onEvent
+ *      Pointer to a match event callback function. If a NULL pointer is given,
+ *      no matches will be returned.
+ *
+ * @param context
+ *      The user defined pointer which will be passed to the callback function.
+ *
+ * @return
+ *      Returns @ref HS_SUCCESS on success; @ref HS_SCAN_TERMINATED if the match
+ *      callback indicated that scanning should stop; other values on error.
+ */
+hs_error_t HS_CDECL hs_scan_vector(const hs_database_t *db,
+                                   const char *const *data,
+                                   const unsigned int *length,
+                                   unsigned int count, unsigned int flags,
+                                   hs_scratch_t *scratch,
+                                   match_event_handler onEvent, void *context);
+
+/**
+ * The TfwStr (chunked string) regular expression scanner.
+ *
+ * This is the function call in which the actual pattern matching takes place
+ * for data held in a TfwStr chunked string rather than in a single contiguous
+ * buffer.
+ *
+ * @param db
+ *      A compiled pattern database.
+ *
+ * @param data
+ *      Pointer to the TfwStr chunked string to be scanned.
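+ *
+ *      A minimal call sketch (@c str, @c on_match and @c ctx are the
+ *      caller's TfwStr, callback and context):
+ *
+ *          hs_scan_tfwstr(db, str, 0, scratch, on_match, ctx);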
+ * + * @param flags + * Flags modifying the behaviour of this function. This parameter is + * provided for future use and is unused at present. + * + * @param scratch + * A per-thread scratch space allocated by @ref hs_alloc_scratch() for + * this database. + * + * @param onEvent + * Pointer to a match event callback function. If a NULL pointer is given, + * no matches will be returned. + * + * @param context + * The user defined pointer which will be passed to the callback function. + * + * @return + * Returns @ref HS_SUCCESS on success; @ref HS_SCAN_TERMINATED if the match + * callback indicated that scanning should stop; other values on error. + */ + +hs_error_t HS_CDECL hs_scan_tfwstr(const hs_database_t *db, + const void *data,/*TfwStr*/ + unsigned int flags, + hs_scratch_t *scratch, + match_event_handler onEvent, void *context); + +/** + * Allocate a "scratch" space for use by Hyperscan. + * + * This is required for runtime use, and one scratch space per thread, or + * concurrent caller, is required. Any allocator callback set by @ref + * hs_set_scratch_allocator() or @ref hs_set_allocator() will be used by this + * function. + * + * @param db + * The database, as produced by @ref hs_compile(). + * + * @param scratch + * On first allocation, a pointer to NULL should be provided so a new + * scratch can be allocated. If a scratch block has been previously + * allocated, then a pointer to it should be passed back in to see if it + * is valid for this database block. If a new scratch block is required, + * the original will be freed and the new one returned, otherwise the + * previous scratch block will be returned. On success, the scratch block + * will be suitable for use with the provided database in addition to any + * databases that original scratch space was suitable for. + * + * @return + * @ref HS_SUCCESS on successful allocation; @ref HS_NOMEM if the + * allocation fails. Other errors may be returned if invalid parameters + * are specified. + */ +hs_error_t HS_CDECL hs_alloc_scratch(const hs_database_t *db, + hs_scratch_t **scratch); + +/** + * Allocate a scratch space that is a clone of an existing scratch space. + * + * This is useful when multiple concurrent threads will be using the same set + * of compiled databases, and another scratch space is required. Any allocator + * callback set by @ref hs_set_scratch_allocator() or @ref hs_set_allocator() + * will be used by this function. + * + * @param src + * The existing @ref hs_scratch_t to be cloned. + * + * @param dest + * A pointer to the new scratch space will be returned here. + * + * @return + * @ref HS_SUCCESS on success; @ref HS_NOMEM if the allocation fails. + * Other errors may be returned if invalid parameters are specified. + */ +hs_error_t HS_CDECL hs_clone_scratch(const hs_scratch_t *src, + hs_scratch_t **dest); + +/** + * Like @ref hs_clone_scratch() but writes to a preallocated buffer. + * + * @param src + * The existing @ref hs_scratch_t to be cloned. + * + * @param dest + * A pointer where scratch space should be initialized. + * + * @return + * @ref HS_SUCCESS on success; + * @ref HS_INVALID if dest is NULL or badly aligned. + */ +hs_error_t HS_CDECL hs_init_scratch(const hs_scratch_t *src, hs_scratch_t *dest); + +/** + * Provides the size of the given scratch space. + * + * @param scratch + * A per-thread scratch space allocated by @ref hs_alloc_scratch() or @ref + * hs_clone_scratch(). + * + * @param scratch_size + * On success, the size of the scratch space in bytes is placed in this + * parameter. 
+ * + * @return + * @ref HS_SUCCESS on success, other values on failure. + */ +hs_error_t HS_CDECL hs_scratch_size(const hs_scratch_t *scratch, + size_t *scratch_size); + +/** + * Free a scratch block previously allocated by @ref hs_alloc_scratch() or @ref + * hs_clone_scratch(). + * + * The free callback set by @ref hs_set_scratch_allocator() or @ref + * hs_set_allocator() will be used by this function. + * + * @param scratch + * The scratch block to be freed. NULL may also be safely provided. + * + * @return + * @ref HS_SUCCESS on success, other values on failure. + */ +hs_error_t HS_CDECL hs_free_scratch(hs_scratch_t *scratch); + +/** + * Callback 'from' return value, indicating that the start of this match was + * too early to be tracked with the requested SOM_HORIZON precision. + */ +#define HS_OFFSET_PAST_HORIZON (~0ULL) + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif /* HS_RUNTIME_H_ */ diff --git a/regex/hs_version.c b/regex/hs_version.c new file mode 100644 index 000000000..04cf46f3f --- /dev/null +++ b/regex/hs_version.c @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "ue2common.h" +#include "hs_common.h" +#include "hs_version.h" + +HS_PUBLIC_API +const char * HS_CDECL hs_version(void) { + return HS_VERSION_STRING; +} diff --git a/regex/hwlm/hwlm.c b/regex/hwlm/hwlm.c new file mode 100644 index 000000000..24aa26a4c --- /dev/null +++ b/regex/hwlm/hwlm.c @@ -0,0 +1,247 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Hamster Wheel Literal Matcher: runtime. + */ +#include "hwlm.h" +#include "hwlm_internal.h" +#include "noodle_engine.h" +#include "scratch.h" +#include "ue2common.h" +#include "fdr/fdr.h" +#include "nfa/accel.h" +#include "nfa/shufti.h" +#include "nfa/truffle.h" +#include "nfa/vermicelli.h" +#ifndef __KERNEL__ +#include +#else +#include +#endif + +#define MIN_ACCEL_LEN_BLOCK 16 +#define MIN_ACCEL_LEN_STREAM 16 + +static really_inline +const u8 *run_hwlm_accel(const union AccelAux *aux, const u8 *ptr, + const u8 *end) { + switch (aux->accel_type) { + case ACCEL_VERM: + DEBUG_PRINTF("single vermicelli for 0x%02hhx\n", aux->verm.c); + return vermicelliExec(aux->verm.c, 0, ptr, end); + case ACCEL_VERM_NOCASE: + DEBUG_PRINTF("single vermicelli-nocase for 0x%02hhx\n", aux->verm.c); + return vermicelliExec(aux->verm.c, 1, ptr, end); + case ACCEL_DVERM: + DEBUG_PRINTF("double vermicelli for 0x%02hhx%02hhx\n", aux->dverm.c1, + aux->dverm.c2); + return vermicelliDoubleExec(aux->dverm.c1, aux->dverm.c2, 0, ptr, end); + case ACCEL_DVERM_NOCASE: + DEBUG_PRINTF("double vermicelli-nocase for 0x%02hhx%02hhx\n", + aux->dverm.c1, aux->dverm.c2); + return vermicelliDoubleExec(aux->dverm.c1, aux->dverm.c2, 1, ptr, end); + case ACCEL_SHUFTI: + DEBUG_PRINTF("single shufti\n"); + return shuftiExec(aux->shufti.lo, aux->shufti.hi, ptr, end); + case ACCEL_TRUFFLE: + DEBUG_PRINTF("truffle\n"); + return truffleExec(aux->truffle.mask1, aux->truffle.mask2, ptr, end); + default: + /* no acceleration, fall through and return current ptr */ + DEBUG_PRINTF("no accel; %u\n", (int)aux->accel_type); + assert(aux->accel_type == ACCEL_NONE); + return ptr; + } +} + +static really_inline +void do_accel_block(const union AccelAux *aux, const u8 *buf, size_t len, + size_t *start) { + + if (len - *start < MIN_ACCEL_LEN_BLOCK) { + return; + } + + const u8 *ptr = buf + *start; + const u8 *end = buf + len; + const u8 offset = aux->generic.offset; + ptr = run_hwlm_accel(aux, ptr, end); + + if (offset) { + ptr -= offset; + if (ptr < buf) { + ptr = buf; + } + } + assert(ptr >= buf); + *start = ptr - buf; +} + +static really_inline +int inaccurate_accel(u8 type) { + /* accels which don't always catch up to the boundary + * DSHUFTI is also inaccurate but it is not used by the hamsters */ + return type == ACCEL_DVERM_NOCASE || type == ACCEL_DVERM; +} + +static never_inline +void do_accel_streaming(const union AccelAux *aux, const u8 *hbuf, size_t hlen, + const u8 *buf, size_t len, size_t *start) { + if (aux->accel_type == 
ACCEL_NONE || len - *start < MIN_ACCEL_LEN_STREAM) { + return; + } + + const u8 offset = aux->generic.offset; + + DEBUG_PRINTF("using accel %hhu offset %hhu\n", aux->accel_type, offset); + + // Scan history buffer, but only if the start offset (which always refers to + // buf) is zero. + + if (!*start && hlen) { + const u8 *ptr1 = hbuf; + const u8 *end1 = hbuf + hlen; + if (hlen >= 16) { + ptr1 = run_hwlm_accel(aux, ptr1, end1); + } + + if ((hlen <= 16 || inaccurate_accel(aux->accel_type)) + && end1 != ptr1 && end1 - ptr1 <= 16) { + DEBUG_PRINTF("already scanned %zu/%zu\n", ptr1 - hbuf, hlen); + /* see if we can finish off the history buffer completely */ + u8 ALIGN_DIRECTIVE temp[17]; + ptrdiff_t tlen = end1 - ptr1; + memcpy(temp, ptr1, tlen); + memset(temp + tlen, 0, 17 - tlen); + if (len) { /* for dverm */ + temp[end1 - ptr1] = *buf; + } + + const u8 *tempp = run_hwlm_accel(aux, temp, temp + 17); + + if (tempp - temp >= tlen) { + ptr1 = end1; + } + DEBUG_PRINTF("got %zu\n", tempp - temp); + } + + if (ptr1 != end1) { + DEBUG_PRINTF("bailing in history\n"); + return; + } + } + + DEBUG_PRINTF("scanning main buffer, start=%zu, len=%zu\n", *start, len); + + const u8 *ptr2 = buf + *start; + const u8 *end2 = buf + len; + + const u8 *found = run_hwlm_accel(aux, ptr2, end2); + + if (found >= ptr2 + offset) { + size_t delta = found - offset - ptr2; + DEBUG_PRINTF("got %zu/%zu in 2nd buffer\n", delta, len); + *start += delta; + } else if (hlen) { + UNUSED size_t remaining = offset + ptr2 - found; + DEBUG_PRINTF("got %zu/%zu remaining in 1st buffer\n", remaining, hlen); + } +} + +hwlm_error_t hwlmExec(const struct HWLM *t, const u8 *buf, size_t len, + size_t start, HWLMCallback cb, struct hs_scratch *scratch, + hwlm_group_t groups) { + assert(t); + + DEBUG_PRINTF("buf len=%zu, start=%zu, groups=%llx\n", len, start, groups); + if (!groups) { + DEBUG_PRINTF("groups all off\n"); + return HWLM_SUCCESS; + } + + assert(start < len); + + if (t->type == HWLM_ENGINE_NOOD) { + DEBUG_PRINTF("calling noodExec\n"); + return noodExec(HWLM_C_DATA(t), buf, len, start, cb, scratch); + } + + assert(t->type == HWLM_ENGINE_FDR); + const union AccelAux *aa = &t->accel0; + if ((groups & ~t->accel1_groups) == 0) { + DEBUG_PRINTF("using hq accel %hhu\n", t->accel1.accel_type); + aa = &t->accel1; + } + do_accel_block(aa, buf, len, &start); + DEBUG_PRINTF("calling frankie (groups=%08llx, start=%zu)\n", groups, start); + return fdrExec(HWLM_C_DATA(t), buf, len, start, cb, scratch, groups); +} + +hwlm_error_t hwlmExecStreaming(const struct HWLM *t, size_t len, size_t start, + HWLMCallback cb, struct hs_scratch *scratch, + hwlm_group_t groups) { + assert(t); + assert(scratch); + + const u8 *hbuf = scratch->core_info.hbuf; + const size_t hlen = scratch->core_info.hlen; + const u8 *buf = scratch->core_info.buf; + + DEBUG_PRINTF("hbuf len=%zu, buf len=%zu, start=%zu, groups=%llx\n", hlen, + len, start, groups); + + if (!groups) { + return HWLM_SUCCESS; + } + + assert(start < len); + + if (t->type == HWLM_ENGINE_NOOD) { + DEBUG_PRINTF("calling noodExec\n"); + // If we've been handed a start offset, we can use a block mode scan at + // that offset. 
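+        // (A non-zero start lies within the main buffer and the caller does
+        // not need matches starting before it, so nothing straddling the
+        // history/buffer boundary has to be considered and a plain block
+        // scan of buf from that offset is enough; with start == 0 the
+        // streaming variant below also checks literals that begin in the
+        // history buffer.)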
+ if (start) { + return noodExec(HWLM_C_DATA(t), buf, len, start, cb, scratch); + } else { + return noodExecStreaming(HWLM_C_DATA(t), hbuf, hlen, buf, len, cb, + scratch); + } + } + + assert(t->type == HWLM_ENGINE_FDR); + const union AccelAux *aa = &t->accel0; + if ((groups & ~t->accel1_groups) == 0) { + DEBUG_PRINTF("using hq accel %hhu\n", t->accel1.accel_type); + aa = &t->accel1; + } + do_accel_streaming(aa, hbuf, hlen, buf, len, &start); + DEBUG_PRINTF("calling frankie (groups=%08llx, start=%zu)\n", groups, start); + return fdrExecStreaming(HWLM_C_DATA(t), hbuf, hlen, buf, len, start, cb, + scratch, groups); +} diff --git a/regex/hwlm/hwlm.h b/regex/hwlm/hwlm.h new file mode 100644 index 000000000..224ecf6bf --- /dev/null +++ b/regex/hwlm/hwlm.h @@ -0,0 +1,145 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Hamster Wheel Literal Matcher: runtime API. + */ + +#ifndef HWLM_H +#define HWLM_H + +#include "ue2common.h" + +#ifdef __cplusplus +extern "C" +{ +#endif + +/** \brief Error return type for exec functions. */ +typedef int hwlm_error_t; + +/** \brief Type representing a set of groups as a bitmap. */ +typedef u64a hwlm_group_t; + +/** \brief HWLM callback return type. */ +typedef hwlm_group_t hwlmcb_rv_t; + +/** \brief Value representing all possible literal groups. */ +#define HWLM_ALL_GROUPS ((hwlm_group_t)~0ULL) + +/** \brief Callback return value indicating that we should continue matching. */ +#define HWLM_CONTINUE_MATCHING HWLM_ALL_GROUPS + +/** \brief Callback return value indicating that we should halt matching. */ +#define HWLM_TERMINATE_MATCHING 0 + +/** \brief Matching finished without being terminated by the user. */ +#define HWLM_SUCCESS 0 + +/** \brief The user terminated matching by returning HWLM_TERMINATE_MATCHING + * from the match callback. */ +#define HWLM_TERMINATED 1 + +/** \brief An error occurred during matching. + * + * This should only be used if an unsupported engine was called (like one + * designed for a different architecture). 
*/ +#define HWLM_ERROR_UNKNOWN 2 + +/** \brief Max length of the literal passed to HWLM. */ +#define HWLM_LITERAL_MAX_LEN 8 + +struct hs_scratch; +struct HWLM; + +/** \brief The type for an HWLM callback. + * + * This callback receives an end-of-match offset, the ID of the match and + * the context pointer that was passed into \ref hwlmExec or + * \ref hwlmExecStreaming. + * + * A callback return of \ref HWLM_TERMINATE_MATCHING will stop matching. + * + * A callback return of \ref HWLM_CONTINUE_MATCHING continues matching. + * + * An arbitrary group mask may be given as the return value. This will be taken + * as a hint by the underlying engine that only literals with groups + * overlapping the provided mask need to be reported. + * + * The underlying engine may choose not to report a match if there is no group + * belonging to the literal which was active at the when the end match location + * was first reached. + */ +typedef hwlmcb_rv_t (*HWLMCallback)(size_t end, u32 id, + struct hs_scratch *scratch); + +/** \brief Match strings in table. + * + * If a match occurs, the callback function given will be called with the index + * of the last character in the string and the \p context (passed through + * without interpretation). + * + * Returns \ref HWLM_TERMINATED if scanning is cancelled due to the callback + * returning \ref HWLM_TERMINATE_MATCHING. + * + * \p start is the first offset at which a match may start. Note: match + * starts may include masks overhanging the main literal. + * + * The underlying engine may choose not to report any match which starts before + * the first possible match of a literal which is in the initial group mask. + */ +hwlm_error_t hwlmExec(const struct HWLM *tab, const u8 *buf, size_t len, + size_t start, HWLMCallback callback, + struct hs_scratch *scratch, hwlm_group_t groups); + +/** \brief As for \ref hwlmExec, but a streaming case across two buffers. + * + * \p len is the length of the main buffer to be scanned. + * + * \p start is an advisory hint representing the first offset at which a match + * may start. Some underlying literal matches may not respect it. Note: match + * starts may include masks overhanging the main literal. + * + * \p scratch is used to access the history buffer, history length and + * the main buffer. + * + * Two buffers/lengths are provided. Matches that occur entirely within + * the history buffer will not be reported by this function. The offsets + * reported for the main buffer are relative to the start of that buffer (a + * match at byte 10 of the main buffer is reported as 10). Matches that start + * in the history buffer will have starts reported with 'negative' values. + */ +hwlm_error_t hwlmExecStreaming(const struct HWLM *tab, size_t len, size_t start, + HWLMCallback callback, + struct hs_scratch *scratch, hwlm_group_t groups); + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif diff --git a/regex/hwlm/hwlm_internal.h b/regex/hwlm/hwlm_internal.h new file mode 100644 index 000000000..e35c84fdf --- /dev/null +++ b/regex/hwlm/hwlm_internal.h @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2015, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Hamster Wheel Literal Matcher: data structures. + */ + +#ifndef HWLM_INTERNAL_H +#define HWLM_INTERNAL_H + +#include "hwlm.h" +#include "ue2common.h" +#include "nfa/accel.h" + +/** \brief Underlying engine is FDR. */ +#define HWLM_ENGINE_FDR 12 + +/** \brief Underlying engine is Noodle. */ +#define HWLM_ENGINE_NOOD 16 + +/** \brief Main Hamster Wheel Literal Matcher header. Followed by + * engine-specific structure. */ +struct HWLM { + u8 type; /**< HWLM_ENGINE_NOOD or HWLM_ENGINE_FDR */ + hwlm_group_t accel1_groups; /**< accelerable groups. */ + union AccelAux accel1; /**< used if group mask is subset of accel1_groups */ + union AccelAux accel0; /**< fallback accel scheme */ +}; + +/** \brief Fetch a const pointer to the underlying engine. */ +#define HWLM_C_DATA(p) ((const void *)((const char *)(p) \ + + ROUNDUP_CL(sizeof(struct HWLM)))) + +/** \brief Fetch a pointer to the underlying engine. */ +#define HWLM_DATA(p) ((void *)((char *)(p) + ROUNDUP_CL(sizeof(struct HWLM)))) + +#endif diff --git a/regex/hwlm/noodle_engine.c b/regex/hwlm/noodle_engine.c new file mode 100644 index 000000000..0af2cb6f0 --- /dev/null +++ b/regex/hwlm/noodle_engine.c @@ -0,0 +1,447 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Noodle literal matcher: runtime. + */ +#include "hwlm.h" +#include "noodle_engine.h" +#include "noodle_internal.h" +#include "scratch.h" +#include "ue2common.h" +#include "util/arch.h" +#include "util/bitutils.h" +#include "util/compare.h" +#include "util/intrinsics.h" +#include "util/join.h" +#include "util/masked_move.h" +#include "util/partial_store.h" +#include "util/simd_utils.h" + +#ifndef __KERNEL__ +#include +#include +#include +#else +#include +#include +#endif + +/** \brief Noodle runtime context. */ +struct cb_info { + HWLMCallback cb; //!< callback function called on match + u32 id; //!< ID to pass to callback on match + struct hs_scratch *scratch; //!< scratch to pass to callback + size_t offsetAdj; //!< used in streaming mode +}; + +#if defined(HAVE_AVX512) +#define CHUNKSIZE 64 +#define MASK_TYPE m512 +#define Z_BITS 64 +#define Z_TYPE u64a +#elif defined(HAVE_AVX2) +#define CHUNKSIZE 32 +#define MASK_TYPE m256 +#define Z_BITS 32 +#define Z_TYPE u32 +#else +#define CHUNKSIZE 16 +#define MASK_TYPE m128 +#define Z_BITS 32 +#define Z_TYPE u32 +#endif + + +#define RETURN_IF_TERMINATED(x) \ + { \ + if ((x) == HWLM_TERMINATED) { \ + return HWLM_TERMINATED; \ + } \ + } + +#define SINGLE_ZSCAN() \ + do { \ + while (unlikely(z)) { \ + Z_TYPE pos = JOIN(findAndClearLSB_, Z_BITS)(&z); \ + size_t matchPos = d - buf + pos; \ + DEBUG_PRINTF("match pos %zu\n", matchPos); \ + hwlmcb_rv_t rv = final(n, buf, len, 1, cbi, matchPos); \ + RETURN_IF_TERMINATED(rv); \ + } \ + } while (0) + +#define DOUBLE_ZSCAN() \ + do { \ + while (unlikely(z)) { \ + Z_TYPE pos = JOIN(findAndClearLSB_, Z_BITS)(&z); \ + size_t matchPos = d - buf + pos - 1; \ + DEBUG_PRINTF("match pos %zu\n", matchPos); \ + hwlmcb_rv_t rv = final(n, buf, len, 0, cbi, matchPos); \ + RETURN_IF_TERMINATED(rv); \ + } \ + } while (0) + +static really_inline +u8 caseClear8(u8 x, bool noCase) { + return (u8)(noCase ? (x & (u8)0xdf) : x); +} + +// Make sure the rest of the string is there. The single character scanner +// is used only for single chars with case insensitivity used correctly, +// so it can go straight to the callback if we get this far. 
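+// Longer literals are confirmed with a single masked compare: up to
+// HWLM_LITERAL_MAX_LEN bytes of the candidate match are reloaded as a u64a
+// and accepted only if (v & msk) == cmp, i.e. every byte position that the
+// precomputed mask constrains holds the expected (possibly case-cleared)
+// value.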
+static really_inline +hwlm_error_t final(const struct noodTable *n, const u8 *buf, UNUSED size_t len, + char single, const struct cb_info *cbi, size_t pos) { + if (single) { + if (n->msk_len == 1) { + goto match; + } + } + assert(len >= n->msk_len); + u64a v = + partial_load_u64a(buf + pos + n->key_offset - n->msk_len, n->msk_len); + DEBUG_PRINTF("v %016llx msk %016llx cmp %016llx\n", v, n->msk, n->cmp); + if ((v & n->msk) != n->cmp) { + /* mask didn't match */ + return HWLM_SUCCESS; + } + +match: + pos -= cbi->offsetAdj; + DEBUG_PRINTF("match @ %zu\n", pos + n->key_offset); + hwlmcb_rv_t rv = cbi->cb(pos + n->key_offset - 1, cbi->id, cbi->scratch); + if (rv == HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATED; + } + return HWLM_SUCCESS; +} + +#if defined(HAVE_AVX512) +#define CHUNKSIZE 64 +#define MASK_TYPE m512 +#include "noodle_engine_avx512.c" +#elif defined(HAVE_AVX2) +#define CHUNKSIZE 32 +#define MASK_TYPE m256 +#include "noodle_engine_avx2.c" +#else +#define CHUNKSIZE 16 +#define MASK_TYPE m128 +#include "noodle_engine_sse.c" +#endif + +static really_inline +hwlm_error_t scanSingleMain(const struct noodTable *n, const u8 *buf, + size_t len, size_t start, bool noCase, + const struct cb_info *cbi) { + + const MASK_TYPE mask1 = getMask(n->key0, noCase); + const MASK_TYPE caseMask = getCaseMask(); + + size_t offset = start + n->msk_len - 1; + size_t end = len; + assert(offset < end); + +#if !defined(HAVE_AVX512) + hwlm_error_t rv; + + if (end - offset < CHUNKSIZE) { + rv = scanSingleShort(n, buf, len, noCase, caseMask, mask1, cbi, offset, + end); + return rv; + } + + if (end - offset == CHUNKSIZE) { + rv = scanSingleUnaligned(n, buf, len, offset, noCase, caseMask, mask1, + cbi, offset, end); + return rv; + } + + uintptr_t data = (uintptr_t)buf; + uintptr_t s2Start = ROUNDUP_N(data + offset, CHUNKSIZE) - data; + uintptr_t last = data + end; + uintptr_t s2End = ROUNDDOWN_N(last, CHUNKSIZE) - data; + uintptr_t s3Start = end - CHUNKSIZE; + + if (offset != s2Start) { + // first scan out to the fast scan starting point + DEBUG_PRINTF("stage 1: -> %zu\n", s2Start); + rv = scanSingleUnaligned(n, buf, len, offset, noCase, caseMask, mask1, + cbi, offset, s2Start); + RETURN_IF_TERMINATED(rv); + } + + if (likely(s2Start != s2End)) { + // scan as far as we can, bounded by the last point this key can + // possibly match + DEBUG_PRINTF("fast: ~ %zu -> %zu\n", s2Start, s2End); + rv = scanSingleFast(n, buf, len, noCase, caseMask, mask1, cbi, s2Start, + s2End); + RETURN_IF_TERMINATED(rv); + } + + // if we are done bail out + if (s2End == len) { + return HWLM_SUCCESS; + } + + DEBUG_PRINTF("stage 3: %zu -> %zu\n", s2End, len); + rv = scanSingleUnaligned(n, buf, len, s3Start, noCase, caseMask, mask1, cbi, + s2End, len); + + return rv; +#else // HAVE_AVX512 + return scanSingle512(n, buf, len, noCase, caseMask, mask1, cbi, offset, + end); +#endif +} + +static really_inline +hwlm_error_t scanDoubleMain(const struct noodTable *n, const u8 *buf, + size_t len, size_t start, bool noCase, + const struct cb_info *cbi) { + // we stop scanning for the key-fragment when the rest of the key can't + // possibly fit in the remaining buffer + size_t end = len - n->key_offset + 2; + + // the first place the key can match + size_t offset = start + n->msk_len - n->key_offset; + + const MASK_TYPE caseMask = getCaseMask(); + const MASK_TYPE mask1 = getMask(n->key0, noCase); + const MASK_TYPE mask2 = getMask(n->key1, noCase); + +#if !defined(HAVE_AVX512) + hwlm_error_t rv; + + if (end - offset < CHUNKSIZE) { + rv = 
scanDoubleShort(n, buf, len, noCase, caseMask, mask1, mask2, cbi, + offset, end); + return rv; + } + if (end - offset == CHUNKSIZE) { + rv = scanDoubleUnaligned(n, buf, len, offset, noCase, caseMask, mask1, + mask2, cbi, offset, end); + return rv; + } + + uintptr_t data = (uintptr_t)buf; + uintptr_t s2Start = ROUNDUP_N(data + offset, CHUNKSIZE) - data; + uintptr_t s1End = s2Start + 1; + uintptr_t last = data + end; + uintptr_t s2End = ROUNDDOWN_N(last, CHUNKSIZE) - data; + uintptr_t s3Start = end - CHUNKSIZE; + uintptr_t off = offset; + + if (s2Start != off) { + // first scan out to the fast scan starting point plus one char past to + // catch the key on the overlap + DEBUG_PRINTF("stage 1: %zu -> %zu\n", off, s2Start); + rv = scanDoubleUnaligned(n, buf, len, offset, noCase, caseMask, mask1, + mask2, cbi, off, s1End); + RETURN_IF_TERMINATED(rv); + } + off = s1End; + + if (s2Start >= end) { + DEBUG_PRINTF("s2 == mL %zu\n", end); + return HWLM_SUCCESS; + } + + if (likely(s2Start != s2End)) { + // scan as far as we can, bounded by the last point this key can + // possibly match + DEBUG_PRINTF("fast: ~ %zu -> %zu\n", s2Start, s3Start); + rv = scanDoubleFast(n, buf, len, noCase, caseMask, mask1, mask2, cbi, + s2Start, s2End); + RETURN_IF_TERMINATED(rv); + off = s2End; + } + + // if there isn't enough data left to match the key, bail out + if (s2End == end) { + return HWLM_SUCCESS; + } + + DEBUG_PRINTF("stage 3: %zu -> %zu\n", s3Start, end); + rv = scanDoubleUnaligned(n, buf, len, s3Start, noCase, caseMask, mask1, + mask2, cbi, off, end); + + return rv; +#else // AVX512 + return scanDouble512(n, buf, len, noCase, caseMask, mask1, mask2, cbi, + offset, end); +#endif // AVX512 +} + + +static really_inline +hwlm_error_t scanSingleNoCase(const struct noodTable *n, const u8 *buf, + size_t len, size_t start, + const struct cb_info *cbi) { + return scanSingleMain(n, buf, len, start, 1, cbi); +} + +static really_inline +hwlm_error_t scanSingleCase(const struct noodTable *n, const u8 *buf, + size_t len, size_t start, + const struct cb_info *cbi) { + return scanSingleMain(n, buf, len, start, 0, cbi); +} + +// Single-character specialisation, used when keyLen = 1 +static really_inline +hwlm_error_t scanSingle(const struct noodTable *n, const u8 *buf, size_t len, + size_t start, bool noCase, const struct cb_info *cbi) { + if (!ourisalpha(n->key0)) { + noCase = 0; // force noCase off if we don't have an alphabetic char + } + + // kinda ugly, but this forces constant propagation + if (noCase) { + return scanSingleNoCase(n, buf, len, start, cbi); + } else { + return scanSingleCase(n, buf, len, start, cbi); + } +} + + +static really_inline +hwlm_error_t scanDoubleNoCase(const struct noodTable *n, const u8 *buf, + size_t len, size_t start, + const struct cb_info *cbi) { + return scanDoubleMain(n, buf, len, start, 1, cbi); +} + +static really_inline +hwlm_error_t scanDoubleCase(const struct noodTable *n, const u8 *buf, + size_t len, size_t start, + const struct cb_info *cbi) { + return scanDoubleMain(n, buf, len, start, 0, cbi); +} + + +static really_inline +hwlm_error_t scanDouble(const struct noodTable *n, const u8 *buf, size_t len, + size_t start, bool noCase, const struct cb_info *cbi) { + // kinda ugly, but this forces constant propagation + if (noCase) { + return scanDoubleNoCase(n, buf, len, start, cbi); + } else { + return scanDoubleCase(n, buf, len, start, cbi); + } +} + +// main entry point for the scan code +static really_inline +hwlm_error_t scan(const struct noodTable *n, const u8 *buf, size_t len, + 
size_t start, char single, bool noCase, + const struct cb_info *cbi) { + if (len - start < n->msk_len) { + // can't find string of length keyLen in a shorter buffer + return HWLM_SUCCESS; + } + + if (single) { + return scanSingle(n, buf, len, start, noCase, cbi); + } else { + return scanDouble(n, buf, len, start, noCase, cbi); + } +} + +/** \brief Block-mode scanner. */ +hwlm_error_t noodExec(const struct noodTable *n, const u8 *buf, size_t len, + size_t start, HWLMCallback cb, + struct hs_scratch *scratch) { + assert(n && buf); + + struct cb_info cbi = {cb, n->id, scratch, 0}; + DEBUG_PRINTF("nood scan of %zu bytes for %*s @ %p\n", len, n->msk_len, + (const char *)&n->cmp, buf); + + return scan(n, buf, len, start, n->single, n->nocase, &cbi); +} + +/** \brief Streaming-mode scanner. */ +hwlm_error_t noodExecStreaming(const struct noodTable *n, const u8 *hbuf, + size_t hlen, const u8 *buf, size_t len, + HWLMCallback cb, struct hs_scratch *scratch) { + assert(n); + + if (len + hlen < n->msk_len) { + DEBUG_PRINTF("not enough bytes for a match\n"); + return HWLM_SUCCESS; + } + + struct cb_info cbi = {cb, n->id, scratch, 0}; + DEBUG_PRINTF("nood scan of %zu bytes (%zu hlen) for %*s @ %p\n", len, hlen, + n->msk_len, (const char *)&n->cmp, buf); + + if (hlen && n->msk_len > 1) { + /* + * we have history, so build up a buffer from enough of the history + * buffer plus what we've been given to scan. Since this is relatively + * short, just check against msk+cmp per byte offset for matches. + */ + assert(hbuf); + u8 ALIGN_DIRECTIVE temp_buf[HWLM_LITERAL_MAX_LEN * 2]; + memset(temp_buf, 0, sizeof(temp_buf)); + + assert(n->msk_len); + size_t tl1 = MIN((size_t)n->msk_len - 1, hlen); + size_t tl2 = MIN((size_t)n->msk_len - 1, len); + + assert(tl1 + tl2 <= sizeof(temp_buf)); + assert(tl1 + tl2 >= n->msk_len); + assert(tl1 <= sizeof(u64a)); + assert(tl2 <= sizeof(u64a)); + DEBUG_PRINTF("using %zu bytes of hist and %zu bytes of buf\n", tl1, tl2); + + unaligned_store_u64a(temp_buf, + partial_load_u64a(hbuf + hlen - tl1, tl1)); + unaligned_store_u64a(temp_buf + tl1, partial_load_u64a(buf, tl2)); + + for (size_t i = 0; i <= tl1 + tl2 - n->msk_len; i++) { + u64a v = unaligned_load_u64a(temp_buf + i); + if ((v & n->msk) == n->cmp) { + size_t m_end = -tl1 + i + n->msk_len - 1; + DEBUG_PRINTF("match @ %zu (i %zu)\n", m_end, i); + hwlmcb_rv_t rv = cb(m_end, n->id, scratch); + if (rv == HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATED; + } + } + } + } + + assert(buf); + + cbi.offsetAdj = 0; + return scan(n, buf, len, 0, n->single, n->nocase, &cbi); +} diff --git a/regex/hwlm/noodle_engine.h b/regex/hwlm/noodle_engine.h new file mode 100644 index 000000000..64422c41f --- /dev/null +++ b/regex/hwlm/noodle_engine.h @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Noodle literal matcher: runtime API. + */ + +#ifndef NOODLE_ENGINE_H +#define NOODLE_ENGINE_H + +#include "hwlm.h" + +#ifdef __cplusplus +extern "C" +{ +#endif + +struct noodTable; +struct hs_scratch; + +/** \brief Block-mode scanner. */ +hwlm_error_t noodExec(const struct noodTable *n, const u8 *buf, size_t len, + size_t start, HWLMCallback cb, + struct hs_scratch *scratch); + +/** \brief Streaming-mode scanner. */ +hwlm_error_t noodExecStreaming(const struct noodTable *n, const u8 *hbuf, + size_t hlen, const u8 *buf, size_t len, + HWLMCallback cb, struct hs_scratch *scratch); + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif diff --git a/regex/hwlm/noodle_engine_avx2.c b/regex/hwlm/noodle_engine_avx2.c new file mode 100644 index 000000000..2a42a3c0e --- /dev/null +++ b/regex/hwlm/noodle_engine_avx2.c @@ -0,0 +1,244 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +/* noodle scan parts for AVX */ + +static really_inline m256 getMask(u8 c, bool noCase) { + u8 k = caseClear8(c, noCase); + return set32x8(k); +} + +static really_inline m256 getCaseMask(void) { + return set32x8(0xdf); +} + +static really_inline +hwlm_error_t scanSingleUnaligned(const struct noodTable *n, const u8 *buf, + size_t len, size_t offset, bool noCase, + m256 caseMask, m256 mask1, + const struct cb_info *cbi, size_t start, + size_t end) { + const u8 *d = buf + offset; + DEBUG_PRINTF("start %zu end %zu offset %zu\n", start, end, offset); + const size_t l = end - start; + + m256 v = loadu256(d); + + if (noCase) { + v = and256(v, caseMask); + } + + u32 z = movemask256(eq256(mask1, v)); + + u32 buf_off = start - offset; + u32 mask = (u32)((u64a)(1ULL << l) - 1) << buf_off; + DEBUG_PRINTF("mask 0x%08x z 0x%08x\n", mask, z); + + z &= mask; + + SINGLE_ZSCAN(); + + return HWLM_SUCCESS; +} + +static really_inline +hwlm_error_t scanDoubleUnaligned(const struct noodTable *n, const u8 *buf, + size_t len, size_t offset, bool noCase, + m256 caseMask, m256 mask1, m256 mask2, + const struct cb_info *cbi, size_t start, + size_t end) { + const u8 *d = buf + offset; + DEBUG_PRINTF("start %zu end %zu offset %zu\n", start, end, offset); + size_t l = end - start; + + m256 v = loadu256(d); + + if (noCase) { + v = and256(v, caseMask); + } + + u32 z0 = movemask256(eq256(mask1, v)); + u32 z1 = movemask256(eq256(mask2, v)); + u32 z = (z0 << 1) & z1; + + // mask out where we can't match + u32 buf_off = start - offset; + u32 mask = (u32)((u64a)(1ULL << l) - 1) << buf_off; + DEBUG_PRINTF("mask 0x%08x z 0x%08x\n", mask, z); + z &= mask; + + DOUBLE_ZSCAN(); + + return HWLM_SUCCESS; +} + +// The short scan routine. It is used both to scan data up to an +// alignment boundary if needed and to finish off data that the aligned scan +// function can't handle (due to small/unaligned chunk at end) +static really_inline +hwlm_error_t scanSingleShort(const struct noodTable *n, const u8 *buf, + size_t len, bool noCase, m256 caseMask, m256 mask1, + const struct cb_info *cbi, size_t start, + size_t end) { + const u8 *d = buf + start; + size_t l = end - start; + DEBUG_PRINTF("l %zu\n", l); + assert(l <= 32); + if (!l) { + return HWLM_SUCCESS; + } + + m256 v; + if (l < 4) { + u8 *vp = (u8*)&v; + switch (l) { + case 3: + vp[2] = d[2]; // fallthrough + fallthrough; + case 2: + vp[1] = d[1]; // fallthrough + fallthrough; + case 1: + vp[0] = d[0]; // fallthrough + } + } else { + v = masked_move256_len(d, l); + } + + if (noCase) { + v = and256(v, caseMask); + } + + // mask out where we can't match + u32 mask = (0xFFFFFFFF >> (32 - l)); + + u32 z = mask & movemask256(eq256(mask1, v)); + + SINGLE_ZSCAN(); + + return HWLM_SUCCESS; +} + +static really_inline +hwlm_error_t scanDoubleShort(const struct noodTable *n, const u8 *buf, + size_t len, bool noCase, m256 caseMask, m256 mask1, + m256 mask2, const struct cb_info *cbi, + size_t start, size_t end) { + const u8 *d = buf + start; + size_t l = end - start; + if (!l) { + return HWLM_SUCCESS; + } + assert(l <= 32); + m256 v; + + DEBUG_PRINTF("d %zu\n", d - buf); + + if (l < 4) { + u8 *vp = (u8*)&v; + switch (l) { + case 3: + vp[2] = d[2]; // fallthrough + fallthrough; + case 2: + vp[1] = d[1]; // fallthrough + fallthrough; + case 1: + vp[0] = d[0]; // fallthrough + } + } else { + v = masked_move256_len(d, l); + } + + if (noCase) { + v = and256(v, caseMask); + } + + u32 z0 = movemask256(eq256(mask1, v)); + u32 z1 = movemask256(eq256(mask2, v)); + u32 z = (z0 << 1) & z1; + + // 
mask out where we can't match + u32 mask = (0xFFFFFFFF >> (32 - l)); + z &= mask; + + DOUBLE_ZSCAN(); + + return HWLM_SUCCESS; +} + +static really_inline +hwlm_error_t scanSingleFast(const struct noodTable *n, const u8 *buf, + size_t len, bool noCase, m256 caseMask, m256 mask1, + const struct cb_info *cbi, size_t start, + size_t end) { + const u8 *d = buf + start, *e = buf + end; + assert(d < e); + + for (; d < e; d += 32) { + m256 v = noCase ? and256(load256(d), caseMask) : load256(d); + + u32 z = movemask256(eq256(mask1, v)); + + // On large packet buffers, this prefetch appears to get us about 2%. + __builtin_prefetch(d + 128); + + SINGLE_ZSCAN(); + } + return HWLM_SUCCESS; +} + +static really_inline +hwlm_error_t scanDoubleFast(const struct noodTable *n, const u8 *buf, + size_t len, bool noCase, m256 caseMask, m256 mask1, + m256 mask2, const struct cb_info *cbi, size_t start, + size_t end) { + const u8 *d = buf + start, *e = buf + end; + DEBUG_PRINTF("start %zu end %zu \n", start, end); + assert(d < e); + u32 lastz0 = 0; + + for (; d < e; d += 32) { + m256 v = noCase ? and256(load256(d), caseMask) : load256(d); + + // we have to pull the masks out of the AVX registers because we can't + // byte shift between the lanes + u32 z0 = movemask256(eq256(mask1, v)); + u32 z1 = movemask256(eq256(mask2, v)); + u32 z = (lastz0 | (z0 << 1)) & z1; + lastz0 = z0 >> 31; + + // On large packet buffers, this prefetch appears to get us about 2%. + __builtin_prefetch(d + 128); + + DOUBLE_ZSCAN(); + + } + return HWLM_SUCCESS; +} + diff --git a/regex/hwlm/noodle_engine_avx512.c b/regex/hwlm/noodle_engine_avx512.c new file mode 100644 index 000000000..8cac1b15c --- /dev/null +++ b/regex/hwlm/noodle_engine_avx512.c @@ -0,0 +1,191 @@ +/* + * Copyright (c) 2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +/* noodle scan parts for AVX512 */ + +static really_inline +m512 getMask(u8 c, bool noCase) { + u8 k = caseClear8(c, noCase); + return set64x8(k); +} + +static really_inline +m512 getCaseMask(void) { + return set64x8(CASE_CLEAR); +} + +// The short scan routine. It is used both to scan data up to an +// alignment boundary if needed and to finish off data that the aligned scan +// function can't handle (due to small/unaligned chunk at end) +static really_inline +hwlm_error_t scanSingleShort(const struct noodTable *n, const u8 *buf, + size_t len, bool noCase, m512 caseMask, m512 mask1, + const struct cb_info *cbi, size_t start, + size_t end) { + const u8 *d = buf + start; + ptrdiff_t scan_len = end - start; + DEBUG_PRINTF("scan_len %zu\n", scan_len); + assert(scan_len <= 64); + if (!scan_len) { + return HWLM_SUCCESS; + } + + __mmask64 k = (~0ULL) >> (64 - scan_len); + DEBUG_PRINTF("load mask 0x%016llx\n", k); + + m512 v = loadu_maskz_m512(k, d); + + if (noCase) { + v = and512(v, caseMask); + } + + // reuse the load mask to indicate valid bytes + u64a z = masked_eq512mask(k, mask1, v); + + SINGLE_ZSCAN(); + + return HWLM_SUCCESS; +} + +static really_inline +hwlm_error_t scanSingle512(const struct noodTable *n, const u8 *buf, size_t len, + bool noCase, m512 caseMask, m512 mask1, + const struct cb_info *cbi, size_t start, + size_t end) { + const u8 *d = buf + start; + const u8 *e = buf + end; + DEBUG_PRINTF("start %p end %p \n", d, e); + assert(d < e); + if (d + 64 >= e) { + goto tail; + } + + // peel off first part to cacheline boundary + const u8 *d1 = ROUNDUP_PTR(d, 64); + if (scanSingleShort(n, buf, len, noCase, caseMask, mask1, cbi, start, + d1 - buf) == HWLM_TERMINATED) { + return HWLM_TERMINATED; + } + d = d1; + + for (; d + 64 < e; d += 64) { + DEBUG_PRINTF("d %p e %p \n", d, e); + m512 v = noCase ? 
and512(load512(d), caseMask) : load512(d); + + u64a z = eq512mask(mask1, v); + __builtin_prefetch(d + 128); + + SINGLE_ZSCAN(); + } + +tail: + DEBUG_PRINTF("d %p e %p \n", d, e); + // finish off tail + + return scanSingleShort(n, buf, len, noCase, caseMask, mask1, cbi, d - buf, + e - buf); +} + +static really_inline +hwlm_error_t scanDoubleShort(const struct noodTable *n, const u8 *buf, + size_t len, bool noCase, m512 caseMask, m512 mask1, + m512 mask2, const struct cb_info *cbi, + u64a *lastz0, size_t start, size_t end) { + DEBUG_PRINTF("start %zu end %zu last 0x%016llx\n", start, end, *lastz0); + const u8 *d = buf + start; + ptrdiff_t scan_len = end - start; + if (!scan_len) { + return HWLM_SUCCESS; + } + assert(scan_len <= 64); + __mmask64 k = (~0ULL) >> (64 - scan_len); + DEBUG_PRINTF("load mask 0x%016llx scan_len %zu\n", k, scan_len); + + m512 v = loadu_maskz_m512(k, d); + if (noCase) { + v = and512(v, caseMask); + } + + u64a z0 = masked_eq512mask(k, mask1, v); + u64a z1 = masked_eq512mask(k, mask2, v); + u64a z = (*lastz0 | (z0 << 1)) & z1; + DEBUG_PRINTF("z 0x%016llx\n", z); + + DOUBLE_ZSCAN(); + *lastz0 = z0 >> (scan_len - 1); + return HWLM_SUCCESS; +} + +static really_inline +hwlm_error_t scanDouble512(const struct noodTable *n, const u8 *buf, size_t len, + bool noCase, m512 caseMask, m512 mask1, m512 mask2, + const struct cb_info *cbi, size_t start, + size_t end) { + const u8 *d = buf + start; + const u8 *e = buf + end; + u64a lastz0 = 0; + DEBUG_PRINTF("start %zu end %zu \n", start, end); + assert(d < e); + if (d + 64 >= e) { + goto tail; + } + + // peel off first part to cacheline boundary + const u8 *d1 = ROUNDUP_PTR(d, 64); + if (scanDoubleShort(n, buf, len, noCase, caseMask, mask1, mask2, cbi, + &lastz0, start, d1 - buf) == HWLM_TERMINATED) { + return HWLM_TERMINATED; + } + d = d1; + + for (; d + 64 < e; d += 64) { + DEBUG_PRINTF("d %p e %p 0x%016llx\n", d, e, lastz0); + m512 v = noCase ? and512(load512(d), caseMask) : load512(d); + + /* we have to pull the masks out of the AVX registers because we can't + byte shift between the lanes */ + u64a z0 = eq512mask(mask1, v); + u64a z1 = eq512mask(mask2, v); + u64a z = (lastz0 | (z0 << 1)) & z1; + lastz0 = z0 >> 63; + + // On large packet buffers, this prefetch appears to get us about 2%. + __builtin_prefetch(d + 256); + + DEBUG_PRINTF("z 0x%016llx\n", z); + + DOUBLE_ZSCAN(); + } + +tail: + DEBUG_PRINTF("d %p e %p off %zu \n", d, e, d - buf); + // finish off tail + + return scanDoubleShort(n, buf, len, noCase, caseMask, mask1, mask2, cbi, + &lastz0, d - buf, end); +} diff --git a/regex/hwlm/noodle_engine_sse.c b/regex/hwlm/noodle_engine_sse.c new file mode 100644 index 000000000..7cd53d7ce --- /dev/null +++ b/regex/hwlm/noodle_engine_sse.c @@ -0,0 +1,203 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/* noodle scan parts for SSE */ + +static really_inline m128 getMask(u8 c, bool noCase) { + u8 k = caseClear8(c, noCase); + return set16x8(k); +} + +static really_inline m128 getCaseMask(void) { + return set16x8(0xdf); +} + +static really_inline +hwlm_error_t scanSingleShort(const struct noodTable *n, const u8 *buf, + size_t len, bool noCase, m128 caseMask, m128 mask1, + const struct cb_info *cbi, size_t start, + size_t end) { + const u8 *d = buf + start; + size_t l = end - start; + DEBUG_PRINTF("l %zu\n", l); + assert(l <= 16); + if (!l) { + return HWLM_SUCCESS; + } + m128 v = zeroes128(); + // we don't have a clever way of doing this move yet + memcpy(&v, d, l); + if (noCase) { + v = and128(v, caseMask); + } + + // mask out where we can't match + u32 mask = (0xFFFF >> (16 - l)); + + u32 z = mask & movemask128(eq128(mask1, v)); + + SINGLE_ZSCAN(); + + return HWLM_SUCCESS; +} + +static really_inline +hwlm_error_t scanSingleUnaligned(const struct noodTable *n, const u8 *buf, + size_t len, size_t offset, bool noCase, + m128 caseMask, m128 mask1, + const struct cb_info *cbi, size_t start, + size_t end) { + const u8 *d = buf + offset; + DEBUG_PRINTF("start %zu end %zu offset %zu\n", start, end, offset); + const size_t l = end - start; + + m128 v = loadu128(d); + + if (noCase) { + v = and128(v, caseMask); + } + + u32 buf_off = start - offset; + u32 mask = ((1 << l) - 1) << buf_off; + + u32 z = mask & movemask128(eq128(mask1, v)); + + DEBUG_PRINTF("mask 0x%08x z 0x%08x\n", mask, z); + + z &= mask; + + SINGLE_ZSCAN(); + + return HWLM_SUCCESS; +} + +static really_inline +hwlm_error_t scanDoubleShort(const struct noodTable *n, const u8 *buf, + size_t len, bool noCase, m128 caseMask, m128 mask1, + m128 mask2, const struct cb_info *cbi, + size_t start, size_t end) { + const u8 *d = buf + start; + size_t l = end - start; + if (!l) { + return HWLM_SUCCESS; + } + assert(l <= 32); + + DEBUG_PRINTF("d %zu\n", d - buf); + m128 v = zeroes128(); + memcpy(&v, d, l); + if (noCase) { + v = and128(v, caseMask); + } + + u32 z = movemask128(and128(lshiftbyte_m128(eq128(mask1, v), 1), + eq128(mask2, v))); + + // mask out where we can't match + u32 mask = (0xFFFF >> (16 - l)); + z &= mask; + + DOUBLE_ZSCAN(); + + return HWLM_SUCCESS; +} + +static really_inline +hwlm_error_t scanDoubleUnaligned(const struct noodTable *n, const u8 *buf, + size_t len, size_t offset, bool noCase, + m128 caseMask, m128 mask1, m128 mask2, + const struct cb_info *cbi, size_t start, + size_t end) { + const u8 *d = buf + offset; + DEBUG_PRINTF("start %zu end %zu offset %zu\n", start, end, offset); + size_t l = end - start; + + m128 v = loadu128(d); + + if (noCase) { + v = and128(v, caseMask); + } + + u32 z = 
movemask128(and128(lshiftbyte_m128(eq128(mask1, v), 1), + eq128(mask2, v))); + + // mask out where we can't match + u32 buf_off = start - offset; + u32 mask = ((1 << l) - 1) << buf_off; + DEBUG_PRINTF("mask 0x%08x z 0x%08x\n", mask, z); + z &= mask; + + DOUBLE_ZSCAN(); + + return HWLM_SUCCESS; +} + +static really_inline +hwlm_error_t scanSingleFast(const struct noodTable *n, const u8 *buf, + size_t len, bool noCase, m128 caseMask, m128 mask1, + const struct cb_info *cbi, size_t start, + size_t end) { + const u8 *d = buf + start, *e = buf + end; + assert(d < e); + + for (; d < e; d += 16) { + m128 v = noCase ? and128(load128(d), caseMask) : load128(d); + + u32 z = movemask128(eq128(mask1, v)); + + // On large packet buffers, this prefetch appears to get us about 2%. + __builtin_prefetch(d + 128); + + SINGLE_ZSCAN(); + } + return HWLM_SUCCESS; +} + +static really_inline +hwlm_error_t scanDoubleFast(const struct noodTable *n, const u8 *buf, + size_t len, bool noCase, m128 caseMask, m128 mask1, + m128 mask2, const struct cb_info *cbi, size_t start, + size_t end) { + const u8 *d = buf + start, *e = buf + end; + assert(d < e); + m128 lastz1 = zeroes128(); + + for (; d < e; d += 16) { + m128 v = noCase ? and128(load128(d), caseMask) : load128(d); + m128 z1 = eq128(mask1, v); + m128 z2 = eq128(mask2, v); + u32 z = movemask128(and128(palignr(z1, lastz1, 15), z2)); + lastz1 = z1; + + // On large packet buffers, this prefetch appears to get us about 2%. + __builtin_prefetch(d + 128); + DEBUG_PRINTF("z 0x%08x\n", z); + DOUBLE_ZSCAN(); + } + return HWLM_SUCCESS; +} diff --git a/regex/hwlm/noodle_internal.h b/regex/hwlm/noodle_internal.h new file mode 100644 index 000000000..8f76f177e --- /dev/null +++ b/regex/hwlm/noodle_internal.h @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Data structures for Noodle literal matcher engine. 
+ */ + +#ifndef NOODLE_INTERNAL_H +#define NOODLE_INTERNAL_H + +#include "ue2common.h" + +struct noodTable { + u32 id; + u64a msk; + u64a cmp; + u8 msk_len; + u8 key_offset; + u8 nocase; + u8 single; + u8 key0; + u8 key1; +}; + +#endif /* NOODLE_INTERNAL_H */ + diff --git a/regex/kmod/.clang-format b/regex/kmod/.clang-format new file mode 100644 index 000000000..1247d54f9 --- /dev/null +++ b/regex/kmod/.clang-format @@ -0,0 +1,683 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# clang-format configuration file. Intended for clang-format >= 11. +# +# For more information, see: +# +# Documentation/process/clang-format.rst +# https://clang.llvm.org/docs/ClangFormat.html +# https://clang.llvm.org/docs/ClangFormatStyleOptions.html +# +--- +AccessModifierOffset: -4 +AlignAfterOpenBracket: Align +AlignConsecutiveAssignments: false +AlignConsecutiveDeclarations: false +AlignEscapedNewlines: Left +AlignOperands: true +AlignTrailingComments: false +AllowAllParametersOfDeclarationOnNextLine: false +AllowShortBlocksOnASingleLine: false +AllowShortCaseLabelsOnASingleLine: false +AllowShortFunctionsOnASingleLine: None +AllowShortIfStatementsOnASingleLine: false +AllowShortLoopsOnASingleLine: false +AlwaysBreakAfterDefinitionReturnType: None +AlwaysBreakAfterReturnType: None +AlwaysBreakBeforeMultilineStrings: false +AlwaysBreakTemplateDeclarations: false +BinPackArguments: true +BinPackParameters: true +BraceWrapping: + AfterClass: false + AfterControlStatement: false + AfterEnum: false + AfterFunction: true + AfterNamespace: true + AfterObjCDeclaration: false + AfterStruct: false + AfterUnion: false + AfterExternBlock: false + BeforeCatch: false + BeforeElse: false + IndentBraces: false + SplitEmptyFunction: true + SplitEmptyRecord: true + SplitEmptyNamespace: true +BreakBeforeBinaryOperators: None +BreakBeforeBraces: Custom +BreakBeforeInheritanceComma: false +BreakBeforeTernaryOperators: false +BreakConstructorInitializersBeforeComma: false +BreakConstructorInitializers: BeforeComma +BreakAfterJavaFieldAnnotations: false +BreakStringLiterals: false +ColumnLimit: 80 +CommentPragmas: '^ IWYU pragma:' +CompactNamespaces: false +ConstructorInitializerAllOnOneLineOrOnePerLine: false +ConstructorInitializerIndentWidth: 8 +ContinuationIndentWidth: 8 +Cpp11BracedListStyle: false +DerivePointerAlignment: false +DisableFormat: false +ExperimentalAutoDetectBinPacking: false +FixNamespaceComments: false + +# Taken from: +# git grep -h '^#define [^[:space:]]*for_each[^[:space:]]*(' include/ tools/ \ +# | sed "s,^#define \([^[:space:]]*for_each[^[:space:]]*\)(.*$, - '\1'," \ +# | LC_ALL=C sort -u +ForEachMacros: + - '__ata_qc_for_each' + - '__bio_for_each_bvec' + - '__bio_for_each_segment' + - '__evlist__for_each_entry' + - '__evlist__for_each_entry_continue' + - '__evlist__for_each_entry_from' + - '__evlist__for_each_entry_reverse' + - '__evlist__for_each_entry_safe' + - '__for_each_mem_range' + - '__for_each_mem_range_rev' + - '__for_each_thread' + - '__hlist_for_each_rcu' + - '__map__for_each_symbol_by_name' + - '__perf_evlist__for_each_entry' + - '__perf_evlist__for_each_entry_reverse' + - '__perf_evlist__for_each_entry_safe' + - '__rq_for_each_bio' + - '__shost_for_each_device' + - 'apei_estatus_for_each_section' + - 'ata_for_each_dev' + - 'ata_for_each_link' + - 'ata_qc_for_each' + - 'ata_qc_for_each_raw' + - 'ata_qc_for_each_with_internal' + - 'ax25_for_each' + - 'ax25_uid_for_each' + - 'bio_for_each_bvec' + - 'bio_for_each_bvec_all' + - 'bio_for_each_folio_all' + - 'bio_for_each_integrity_vec' + - 
'bio_for_each_segment' + - 'bio_for_each_segment_all' + - 'bio_list_for_each' + - 'bip_for_each_vec' + - 'bond_for_each_slave' + - 'bond_for_each_slave_rcu' + - 'bpf__perf_for_each_map' + - 'bpf__perf_for_each_map_named' + - 'bpf_for_each_spilled_reg' + - 'bpf_object__for_each_map' + - 'bpf_object__for_each_program' + - 'bpf_object__for_each_safe' + - 'bpf_perf_object__for_each' + - 'btree_for_each_safe128' + - 'btree_for_each_safe32' + - 'btree_for_each_safe64' + - 'btree_for_each_safel' + - 'card_for_each_dev' + - 'cgroup_taskset_for_each' + - 'cgroup_taskset_for_each_leader' + - 'cpufreq_for_each_efficient_entry_idx' + - 'cpufreq_for_each_entry' + - 'cpufreq_for_each_entry_idx' + - 'cpufreq_for_each_valid_entry' + - 'cpufreq_for_each_valid_entry_idx' + - 'css_for_each_child' + - 'css_for_each_descendant_post' + - 'css_for_each_descendant_pre' + - 'damon_for_each_region' + - 'damon_for_each_region_safe' + - 'damon_for_each_scheme' + - 'damon_for_each_scheme_safe' + - 'damon_for_each_target' + - 'damon_for_each_target_safe' + - 'data__for_each_file' + - 'data__for_each_file_new' + - 'data__for_each_file_start' + - 'device_for_each_child_node' + - 'displayid_iter_for_each' + - 'dma_fence_array_for_each' + - 'dma_fence_chain_for_each' + - 'dma_fence_unwrap_for_each' + - 'dma_resv_for_each_fence' + - 'dma_resv_for_each_fence_unlocked' + - 'do_for_each_ftrace_op' + - 'drm_atomic_crtc_for_each_plane' + - 'drm_atomic_crtc_state_for_each_plane' + - 'drm_atomic_crtc_state_for_each_plane_state' + - 'drm_atomic_for_each_plane_damage' + - 'drm_client_for_each_connector_iter' + - 'drm_client_for_each_modeset' + - 'drm_connector_for_each_possible_encoder' + - 'drm_for_each_bridge_in_chain' + - 'drm_for_each_connector_iter' + - 'drm_for_each_crtc' + - 'drm_for_each_crtc_reverse' + - 'drm_for_each_encoder' + - 'drm_for_each_encoder_mask' + - 'drm_for_each_fb' + - 'drm_for_each_legacy_plane' + - 'drm_for_each_plane' + - 'drm_for_each_plane_mask' + - 'drm_for_each_privobj' + - 'drm_mm_for_each_hole' + - 'drm_mm_for_each_node' + - 'drm_mm_for_each_node_in_range' + - 'drm_mm_for_each_node_safe' + - 'dsa_switch_for_each_available_port' + - 'dsa_switch_for_each_cpu_port' + - 'dsa_switch_for_each_port' + - 'dsa_switch_for_each_port_continue_reverse' + - 'dsa_switch_for_each_port_safe' + - 'dsa_switch_for_each_user_port' + - 'dsa_tree_for_each_user_port' + - 'dso__for_each_symbol' + - 'dsos__for_each_with_build_id' + - 'elf_hash_for_each_possible' + - 'elf_section__for_each_rel' + - 'elf_section__for_each_rela' + - 'elf_symtab__for_each_symbol' + - 'evlist__for_each_cpu' + - 'evlist__for_each_entry' + - 'evlist__for_each_entry_continue' + - 'evlist__for_each_entry_from' + - 'evlist__for_each_entry_reverse' + - 'evlist__for_each_entry_safe' + - 'flow_action_for_each' + - 'for_each_acpi_dev_match' + - 'for_each_active_dev_scope' + - 'for_each_active_drhd_unit' + - 'for_each_active_iommu' + - 'for_each_aggr_pgid' + - 'for_each_available_child_of_node' + - 'for_each_bench' + - 'for_each_bio' + - 'for_each_board_func_rsrc' + - 'for_each_btf_ext_rec' + - 'for_each_btf_ext_sec' + - 'for_each_bvec' + - 'for_each_card_auxs' + - 'for_each_card_auxs_safe' + - 'for_each_card_components' + - 'for_each_card_dapms' + - 'for_each_card_pre_auxs' + - 'for_each_card_prelinks' + - 'for_each_card_rtds' + - 'for_each_card_rtds_safe' + - 'for_each_card_widgets' + - 'for_each_card_widgets_safe' + - 'for_each_cgroup_storage_type' + - 'for_each_child_of_node' + - 'for_each_clear_bit' + - 'for_each_clear_bit_from' + - 
'for_each_clear_bitrange' + - 'for_each_clear_bitrange_from' + - 'for_each_cmd' + - 'for_each_cmsghdr' + - 'for_each_collection' + - 'for_each_comp_order' + - 'for_each_compatible_node' + - 'for_each_component_dais' + - 'for_each_component_dais_safe' + - 'for_each_console' + - 'for_each_cpu' + - 'for_each_cpu_and' + - 'for_each_cpu_not' + - 'for_each_cpu_wrap' + - 'for_each_dapm_widgets' + - 'for_each_dedup_cand' + - 'for_each_dev_addr' + - 'for_each_dev_scope' + - 'for_each_dma_cap_mask' + - 'for_each_dpcm_be' + - 'for_each_dpcm_be_rollback' + - 'for_each_dpcm_be_safe' + - 'for_each_dpcm_fe' + - 'for_each_drhd_unit' + - 'for_each_dss_dev' + - 'for_each_efi_memory_desc' + - 'for_each_efi_memory_desc_in_map' + - 'for_each_element' + - 'for_each_element_extid' + - 'for_each_element_id' + - 'for_each_endpoint_of_node' + - 'for_each_event' + - 'for_each_event_tps' + - 'for_each_evictable_lru' + - 'for_each_fib6_node_rt_rcu' + - 'for_each_fib6_walker_rt' + - 'for_each_free_mem_pfn_range_in_zone' + - 'for_each_free_mem_pfn_range_in_zone_from' + - 'for_each_free_mem_range' + - 'for_each_free_mem_range_reverse' + - 'for_each_func_rsrc' + - 'for_each_group_evsel' + - 'for_each_group_member' + - 'for_each_hstate' + - 'for_each_if' + - 'for_each_inject_fn' + - 'for_each_insn' + - 'for_each_insn_prefix' + - 'for_each_intid' + - 'for_each_iommu' + - 'for_each_ip_tunnel_rcu' + - 'for_each_irq_nr' + - 'for_each_lang' + - 'for_each_link_codecs' + - 'for_each_link_cpus' + - 'for_each_link_platforms' + - 'for_each_lru' + - 'for_each_matching_node' + - 'for_each_matching_node_and_match' + - 'for_each_mem_pfn_range' + - 'for_each_mem_range' + - 'for_each_mem_range_rev' + - 'for_each_mem_region' + - 'for_each_member' + - 'for_each_memory' + - 'for_each_migratetype_order' + - 'for_each_missing_reg' + - 'for_each_net' + - 'for_each_net_continue_reverse' + - 'for_each_net_rcu' + - 'for_each_netdev' + - 'for_each_netdev_continue' + - 'for_each_netdev_continue_rcu' + - 'for_each_netdev_continue_reverse' + - 'for_each_netdev_feature' + - 'for_each_netdev_in_bond_rcu' + - 'for_each_netdev_rcu' + - 'for_each_netdev_reverse' + - 'for_each_netdev_safe' + - 'for_each_new_connector_in_state' + - 'for_each_new_crtc_in_state' + - 'for_each_new_mst_mgr_in_state' + - 'for_each_new_plane_in_state' + - 'for_each_new_plane_in_state_reverse' + - 'for_each_new_private_obj_in_state' + - 'for_each_new_reg' + - 'for_each_node' + - 'for_each_node_by_name' + - 'for_each_node_by_type' + - 'for_each_node_mask' + - 'for_each_node_state' + - 'for_each_node_with_cpus' + - 'for_each_node_with_property' + - 'for_each_nonreserved_multicast_dest_pgid' + - 'for_each_of_allnodes' + - 'for_each_of_allnodes_from' + - 'for_each_of_cpu_node' + - 'for_each_of_pci_range' + - 'for_each_old_connector_in_state' + - 'for_each_old_crtc_in_state' + - 'for_each_old_mst_mgr_in_state' + - 'for_each_old_plane_in_state' + - 'for_each_old_private_obj_in_state' + - 'for_each_oldnew_connector_in_state' + - 'for_each_oldnew_crtc_in_state' + - 'for_each_oldnew_mst_mgr_in_state' + - 'for_each_oldnew_plane_in_state' + - 'for_each_oldnew_plane_in_state_reverse' + - 'for_each_oldnew_private_obj_in_state' + - 'for_each_online_cpu' + - 'for_each_online_node' + - 'for_each_online_pgdat' + - 'for_each_path' + - 'for_each_pci_bridge' + - 'for_each_pci_dev' + - 'for_each_pcm_streams' + - 'for_each_physmem_range' + - 'for_each_populated_zone' + - 'for_each_possible_cpu' + - 'for_each_present_cpu' + - 'for_each_prime_number' + - 'for_each_prime_number_from' + - 
'for_each_probe_cache_entry' + - 'for_each_process' + - 'for_each_process_thread' + - 'for_each_prop_codec_conf' + - 'for_each_prop_dai_codec' + - 'for_each_prop_dai_cpu' + - 'for_each_prop_dlc_codecs' + - 'for_each_prop_dlc_cpus' + - 'for_each_prop_dlc_platforms' + - 'for_each_property_of_node' + - 'for_each_reg' + - 'for_each_reg_filtered' + - 'for_each_registered_fb' + - 'for_each_requested_gpio' + - 'for_each_requested_gpio_in_range' + - 'for_each_reserved_mem_range' + - 'for_each_reserved_mem_region' + - 'for_each_rtd_codec_dais' + - 'for_each_rtd_components' + - 'for_each_rtd_cpu_dais' + - 'for_each_rtd_dais' + - 'for_each_script' + - 'for_each_sec' + - 'for_each_set_bit' + - 'for_each_set_bit_from' + - 'for_each_set_bitrange' + - 'for_each_set_bitrange_from' + - 'for_each_set_clump8' + - 'for_each_sg' + - 'for_each_sg_dma_page' + - 'for_each_sg_page' + - 'for_each_sgtable_dma_page' + - 'for_each_sgtable_dma_sg' + - 'for_each_sgtable_page' + - 'for_each_sgtable_sg' + - 'for_each_shell_test' + - 'for_each_sibling_event' + - 'for_each_subelement' + - 'for_each_subelement_extid' + - 'for_each_subelement_id' + - 'for_each_sublist' + - 'for_each_subsystem' + - 'for_each_supported_activate_fn' + - 'for_each_supported_inject_fn' + - 'for_each_test' + - 'for_each_thread' + - 'for_each_token' + - 'for_each_unicast_dest_pgid' + - 'for_each_vsi' + - 'for_each_wakeup_source' + - 'for_each_zone' + - 'for_each_zone_zonelist' + - 'for_each_zone_zonelist_nodemask' + - 'func_for_each_insn' + - 'fwnode_for_each_available_child_node' + - 'fwnode_for_each_child_node' + - 'fwnode_graph_for_each_endpoint' + - 'gadget_for_each_ep' + - 'genradix_for_each' + - 'genradix_for_each_from' + - 'hash_for_each' + - 'hash_for_each_possible' + - 'hash_for_each_possible_rcu' + - 'hash_for_each_possible_rcu_notrace' + - 'hash_for_each_possible_safe' + - 'hash_for_each_rcu' + - 'hash_for_each_safe' + - 'hashmap__for_each_entry' + - 'hashmap__for_each_entry_safe' + - 'hashmap__for_each_key_entry' + - 'hashmap__for_each_key_entry_safe' + - 'hctx_for_each_ctx' + - 'hists__for_each_format' + - 'hists__for_each_sort_list' + - 'hlist_bl_for_each_entry' + - 'hlist_bl_for_each_entry_rcu' + - 'hlist_bl_for_each_entry_safe' + - 'hlist_for_each' + - 'hlist_for_each_entry' + - 'hlist_for_each_entry_continue' + - 'hlist_for_each_entry_continue_rcu' + - 'hlist_for_each_entry_continue_rcu_bh' + - 'hlist_for_each_entry_from' + - 'hlist_for_each_entry_from_rcu' + - 'hlist_for_each_entry_rcu' + - 'hlist_for_each_entry_rcu_bh' + - 'hlist_for_each_entry_rcu_notrace' + - 'hlist_for_each_entry_safe' + - 'hlist_for_each_entry_srcu' + - 'hlist_for_each_safe' + - 'hlist_nulls_for_each_entry' + - 'hlist_nulls_for_each_entry_from' + - 'hlist_nulls_for_each_entry_rcu' + - 'hlist_nulls_for_each_entry_safe' + - 'i3c_bus_for_each_i2cdev' + - 'i3c_bus_for_each_i3cdev' + - 'idr_for_each_entry' + - 'idr_for_each_entry_continue' + - 'idr_for_each_entry_continue_ul' + - 'idr_for_each_entry_ul' + - 'in_dev_for_each_ifa_rcu' + - 'in_dev_for_each_ifa_rtnl' + - 'inet_bind_bucket_for_each' + - 'inet_lhash2_for_each_icsk' + - 'inet_lhash2_for_each_icsk_continue' + - 'inet_lhash2_for_each_icsk_rcu' + - 'intlist__for_each_entry' + - 'intlist__for_each_entry_safe' + - 'kcore_copy__for_each_phdr' + - 'key_for_each' + - 'key_for_each_safe' + - 'klp_for_each_func' + - 'klp_for_each_func_safe' + - 'klp_for_each_func_static' + - 'klp_for_each_object' + - 'klp_for_each_object_safe' + - 'klp_for_each_object_static' + - 'kunit_suite_for_each_test_case' + - 
'kvm_for_each_memslot' + - 'kvm_for_each_memslot_in_gfn_range' + - 'kvm_for_each_vcpu' + - 'libbpf_nla_for_each_attr' + - 'list_for_each' + - 'list_for_each_codec' + - 'list_for_each_codec_safe' + - 'list_for_each_continue' + - 'list_for_each_entry' + - 'list_for_each_entry_continue' + - 'list_for_each_entry_continue_rcu' + - 'list_for_each_entry_continue_reverse' + - 'list_for_each_entry_from' + - 'list_for_each_entry_from_rcu' + - 'list_for_each_entry_from_reverse' + - 'list_for_each_entry_lockless' + - 'list_for_each_entry_rcu' + - 'list_for_each_entry_reverse' + - 'list_for_each_entry_safe' + - 'list_for_each_entry_safe_continue' + - 'list_for_each_entry_safe_from' + - 'list_for_each_entry_safe_reverse' + - 'list_for_each_entry_srcu' + - 'list_for_each_from' + - 'list_for_each_prev' + - 'list_for_each_prev_safe' + - 'list_for_each_safe' + - 'llist_for_each' + - 'llist_for_each_entry' + - 'llist_for_each_entry_safe' + - 'llist_for_each_safe' + - 'map__for_each_symbol' + - 'map__for_each_symbol_by_name' + - 'map_for_each_event' + - 'map_for_each_metric' + - 'maps__for_each_entry' + - 'maps__for_each_entry_safe' + - 'mci_for_each_dimm' + - 'media_device_for_each_entity' + - 'media_device_for_each_intf' + - 'media_device_for_each_link' + - 'media_device_for_each_pad' + - 'msi_for_each_desc' + - 'nanddev_io_for_each_page' + - 'netdev_for_each_lower_dev' + - 'netdev_for_each_lower_private' + - 'netdev_for_each_lower_private_rcu' + - 'netdev_for_each_mc_addr' + - 'netdev_for_each_uc_addr' + - 'netdev_for_each_upper_dev_rcu' + - 'netdev_hw_addr_list_for_each' + - 'nft_rule_for_each_expr' + - 'nla_for_each_attr' + - 'nla_for_each_nested' + - 'nlmsg_for_each_attr' + - 'nlmsg_for_each_msg' + - 'nr_neigh_for_each' + - 'nr_neigh_for_each_safe' + - 'nr_node_for_each' + - 'nr_node_for_each_safe' + - 'of_for_each_phandle' + - 'of_property_for_each_string' + - 'of_property_for_each_u32' + - 'pci_bus_for_each_resource' + - 'pci_doe_for_each_off' + - 'pcl_for_each_chunk' + - 'pcl_for_each_segment' + - 'pcm_for_each_format' + - 'perf_config_items__for_each_entry' + - 'perf_config_sections__for_each_entry' + - 'perf_config_set__for_each_entry' + - 'perf_cpu_map__for_each_cpu' + - 'perf_evlist__for_each_entry' + - 'perf_evlist__for_each_entry_reverse' + - 'perf_evlist__for_each_entry_safe' + - 'perf_evlist__for_each_evsel' + - 'perf_evlist__for_each_mmap' + - 'perf_hpp_list__for_each_format' + - 'perf_hpp_list__for_each_format_safe' + - 'perf_hpp_list__for_each_sort_list' + - 'perf_hpp_list__for_each_sort_list_safe' + - 'perf_pmu__for_each_hybrid_pmu' + - 'ping_portaddr_for_each_entry' + - 'plist_for_each' + - 'plist_for_each_continue' + - 'plist_for_each_entry' + - 'plist_for_each_entry_continue' + - 'plist_for_each_entry_safe' + - 'plist_for_each_safe' + - 'pnp_for_each_card' + - 'pnp_for_each_dev' + - 'protocol_for_each_card' + - 'protocol_for_each_dev' + - 'queue_for_each_hw_ctx' + - 'radix_tree_for_each_slot' + - 'radix_tree_for_each_tagged' + - 'rb_for_each' + - 'rbtree_postorder_for_each_entry_safe' + - 'rdma_for_each_block' + - 'rdma_for_each_port' + - 'rdma_umem_for_each_dma_block' + - 'resort_rb__for_each_entry' + - 'resource_list_for_each_entry' + - 'resource_list_for_each_entry_safe' + - 'rhl_for_each_entry_rcu' + - 'rhl_for_each_rcu' + - 'rht_for_each' + - 'rht_for_each_entry' + - 'rht_for_each_entry_from' + - 'rht_for_each_entry_rcu' + - 'rht_for_each_entry_rcu_from' + - 'rht_for_each_entry_safe' + - 'rht_for_each_from' + - 'rht_for_each_rcu' + - 'rht_for_each_rcu_from' + - 'rq_for_each_bvec' 
+ - 'rq_for_each_segment' + - 'rq_list_for_each' + - 'rq_list_for_each_safe' + - 'scsi_for_each_prot_sg' + - 'scsi_for_each_sg' + - 'sctp_for_each_hentry' + - 'sctp_skb_for_each' + - 'sec_for_each_insn' + - 'sec_for_each_insn_continue' + - 'sec_for_each_insn_from' + - 'shdma_for_each_chan' + - 'shost_for_each_device' + - 'sk_for_each' + - 'sk_for_each_bound' + - 'sk_for_each_entry_offset_rcu' + - 'sk_for_each_from' + - 'sk_for_each_rcu' + - 'sk_for_each_safe' + - 'sk_nulls_for_each' + - 'sk_nulls_for_each_from' + - 'sk_nulls_for_each_rcu' + - 'snd_array_for_each' + - 'snd_pcm_group_for_each_entry' + - 'snd_soc_dapm_widget_for_each_path' + - 'snd_soc_dapm_widget_for_each_path_safe' + - 'snd_soc_dapm_widget_for_each_sink_path' + - 'snd_soc_dapm_widget_for_each_source_path' + - 'strlist__for_each_entry' + - 'strlist__for_each_entry_safe' + - 'sym_for_each_insn' + - 'sym_for_each_insn_continue_reverse' + - 'symbols__for_each_entry' + - 'tb_property_for_each' + - 'tcf_act_for_each_action' + - 'tcf_exts_for_each_action' + - 'udp_portaddr_for_each_entry' + - 'udp_portaddr_for_each_entry_rcu' + - 'usb_hub_for_each_child' + - 'v4l2_device_for_each_subdev' + - 'v4l2_m2m_for_each_dst_buf' + - 'v4l2_m2m_for_each_dst_buf_safe' + - 'v4l2_m2m_for_each_src_buf' + - 'v4l2_m2m_for_each_src_buf_safe' + - 'virtio_device_for_each_vq' + - 'while_for_each_ftrace_op' + - 'xa_for_each' + - 'xa_for_each_marked' + - 'xa_for_each_range' + - 'xa_for_each_start' + - 'xas_for_each' + - 'xas_for_each_conflict' + - 'xas_for_each_marked' + - 'xbc_array_for_each_value' + - 'xbc_for_each_key_value' + - 'xbc_node_for_each_array_value' + - 'xbc_node_for_each_child' + - 'xbc_node_for_each_key_value' + - 'xbc_node_for_each_subkey' + - 'zorro_for_each_dev' + +IncludeBlocks: Preserve +IncludeCategories: + - Regex: '.*' + Priority: 1 +IncludeIsMainRegex: '(Test)?$' +IndentCaseLabels: false +IndentGotoLabels: false +IndentPPDirectives: None +IndentWidth: 8 +IndentWrappedFunctionNames: false +JavaScriptQuotes: Leave +JavaScriptWrapImports: true +KeepEmptyLinesAtTheStartOfBlocks: false +MacroBlockBegin: '' +MacroBlockEnd: '' +MaxEmptyLinesToKeep: 1 +NamespaceIndentation: None +ObjCBinPackProtocolList: Auto +ObjCBlockIndentWidth: 8 +ObjCSpaceAfterProperty: true +ObjCSpaceBeforeProtocolList: true + +# Taken from git's rules +PenaltyBreakAssignment: 10 +PenaltyBreakBeforeFirstCallParameter: 30 +PenaltyBreakComment: 10 +PenaltyBreakFirstLessLess: 0 +PenaltyBreakString: 10 +PenaltyExcessCharacter: 100 +PenaltyReturnTypeOnItsOwnLine: 60 + +PointerAlignment: Right +ReflowComments: false +SortIncludes: false +SortUsingDeclarations: false +SpaceAfterCStyleCast: false +SpaceAfterTemplateKeyword: true +SpaceBeforeAssignmentOperators: true +SpaceBeforeCtorInitializerColon: true +SpaceBeforeInheritanceColon: true +SpaceBeforeParens: ControlStatementsExceptForEachMacros +SpaceBeforeRangeBasedForLoopColon: true +SpaceInEmptyParentheses: false +SpacesBeforeTrailingComments: 1 +SpacesInAngles: false +SpacesInContainerLiterals: false +SpacesInCStyleCastParentheses: false +SpacesInParentheses: false +SpacesInSquareBrackets: false +Standard: Cpp03 +TabWidth: 8 +UseTab: Always +... 
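
The .clang-format profile above is the standard Linux kernel formatting configuration (clang-format >= 11: 8-column tabs, an 80-column limit, and the kernel's for_each macro list) and is meant to be applied to the C sources under regex/kmod/. As a rough illustration of the style it enforces — count_nonzero() below is a hypothetical standalone helper, not a function from this patch — kernel-formatted code uses tabs for indentation, puts function braces on their own line, and keeps control-statement braces on the same line:

/*
 * Hypothetical example only: a trivial helper formatted the way
 * regex/kmod/.clang-format would format it (tabs, 80 columns,
 * kernel brace placement). It is not part of the module; it only
 * shows the target style.
 */
static int count_nonzero(const unsigned long long *vals, int n)
{
	int i, cnt = 0;

	for (i = 0; i < n; i++) {
		if (vals[i])
			cnt++;
	}

	return cnt;
}
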
diff --git a/regex/kmod/config.h b/regex/kmod/config.h new file mode 100644 index 000000000..22f0341f2 --- /dev/null +++ b/regex/kmod/config.h @@ -0,0 +1,109 @@ +/* used by cmake */ + +#ifndef CONFIG_H_ +#define CONFIG_H_ + +/* "Define if the build is 32 bit" */ +/* #undef ARCH_32_BIT */ + +/* "Define if the build is 64 bit" */ +#define ARCH_64_BIT + +/* "Define if building for IA32" */ +/* #undef ARCH_IA32 */ + +/* "Define if building for EM64T" */ +#define ARCH_X86_64 + +/* internal build, switch on dump support. */ +#define DUMP_SUPPORT + +/* Define if building "fat" runtime. */ +/* #undef FAT_RUNTIME */ + +/* Define if building AVX-512 in the fat runtime. */ +/* #undef BUILD_AVX512 */ + +/* Define if building AVX512VBMI in the fat runtime. */ +/* #undef BUILD_AVX512VBMI */ + +/* Define to 1 if `backtrace' works. */ +#define HAVE_BACKTRACE + +/* C compiler has __builtin_assume_aligned */ +#define HAVE_CC_BUILTIN_ASSUME_ALIGNED + +/* C++ compiler has __builtin_assume_aligned */ +#define HAVE_CXX_BUILTIN_ASSUME_ALIGNED + +/* C++ compiler has x86intrin.h */ +#define HAVE_CXX_X86INTRIN_H + +/* C compiler has x86intrin.h */ +#define HAVE_C_X86INTRIN_H + +/* C++ compiler has intrin.h */ +/* #undef HAVE_CXX_INTRIN_H */ + +/* C compiler has intrin.h */ +/* #undef HAVE_C_INTRIN_H */ + +/* Define to 1 if you have the declaration of `pthread_setaffinity_np', and to + 0 if you don't. */ +/* #undef HAVE_DECL_PTHREAD_SETAFFINITY_NP */ + +/* #undef HAVE_PTHREAD_NP_H */ + +/* Define to 1 if you have the `malloc_info' function. */ +/* #undef HAVE_MALLOC_INFO */ + +/* Define to 1 if you have the `memmem' function. */ +/* #undef HAVE_MEMMEM */ + +/* Define to 1 if you have a working `mmap' system call. */ +/* #undef HAVE_MMAP */ + +/* Define to 1 if `posix_memalign' works. */ +/* #undef HAVE_POSIX_MEMALIGN */ + +/* Define to 1 if you have the `setrlimit' function. */ +/* #undef HAVE_SETRLIMIT */ + +/* Define to 1 if you have the `shmget' function. */ +/* #undef HAVE_SHMGET */ + +/* Define to 1 if you have the `sigaction' function. */ +/* #undef HAVE_SIGACTION */ + +/* Define to 1 if you have the `sigaltstack' function. */ +/* #undef HAVE_SIGALTSTACK */ + +/* Define if the sqlite3_open_v2 call is available */ +/* #undef HAVE_SQLITE3_OPEN_V2 */ + +/* Define to 1 if you have the header file. */ +/* #undef HAVE_UNISTD_H */ + +/* Define to 1 if you have the `_aligned_malloc' function. */ +/* #undef HAVE__ALIGNED_MALLOC */ + +/* Define if compiler has __builtin_constant_p */ +#define HAVE__BUILTIN_CONSTANT_P + +/* Optimize, inline critical functions */ +#define HS_OPTIMIZE + +#define HS_VERSION +#define HS_MAJOR_VERSION +#define HS_MINOR_VERSION +/* #undef HS_PATCH_VERSION */ + +#define BUILD_DATE + +/* define if this is a release build. */ +#define RELEASE_BUILD + +/* define if reverse_graph requires patch for boost 1.62.0 */ +/* #undef BOOST_REVGRAPH_PATCH */ + +#endif /* CONFIG_H_ */ diff --git a/regex/kmod/hs_version.h b/regex/kmod/hs_version.h new file mode 100644 index 000000000..f6fd235ba --- /dev/null +++ b/regex/kmod/hs_version.h @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2015, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef HS_VERSION_H_C6428FAF8E3713 +#define HS_VERSION_H_C6428FAF8E3713 + +/** + * A version string to identify this release of Hyperscan. + */ +#define HS_VERSION_STRING "5.4.0 2022-03-31" + +#define HS_VERSION_32BIT ((5 << 24) | (4 << 16) | (0 << 8) | 0) + +#endif /* HS_VERSION_H_C6428FAF8E3713 */ diff --git a/regex/kmod/rex.c b/regex/kmod/rex.c new file mode 100644 index 000000000..69d8638a3 --- /dev/null +++ b/regex/kmod/rex.c @@ -0,0 +1,649 @@ +/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */ +/* SPDX-FileCopyrightText: Copyright 2022 G-Core Labs S.A. */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#define CREATE_TRACE_POINTS +#include "rex_trace.h" +#include "rex.h" + +#include "hs_runtime.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +//#include "fw/str.h" +//#include + +static ulong max_db_size = 4 << 20; +module_param(max_db_size, ulong, 0644); +MODULE_PARM_DESC(max_db_size, "Maximum size of configfs upload, default=4MB"); + +static DEFINE_IDR(rex_idr); +static DEFINE_MUTEX(rex_config_mutex); + +/** A wrapper around hs_database_t where we may store additional fields. */ +struct rex_database { + void __percpu *scratch; /* TODO: make it global */ + u8 bytes[] __aligned(8); +}; + +static inline hs_database_t *patterns(struct rex_database *db) +{ + if (!db) + return NULL; + return (hs_database_t *)db->bytes; +} + +/** + * Represent a configurable hyperscan database. + * @id: Handle used by BPF programs from rex_scan_bytes() kfunc (rw). + * @epoch: Sequential number which may be used to detect changes (ro). + * @note: An arbitrary user string (rw). + * @database: Compiled database binary (rw). + * + * Contains other derived read-only parameters: + * /info: Brief database description. 
+ * + */ +struct rex_policy { + u32 id; + u32 epoch; + struct mutex lock; + struct rex_database __rcu *database; + struct config_item item; + char note[PAGE_SIZE]; +}; + +struct rex_scan_ctx { + struct rex_scan_attr *attr; + const void *block; + size_t block_len; +}; + +static int rex_scan_cb(unsigned int expression, unsigned long long from, + unsigned long long to, unsigned int flags, void *raw_ctx) +{ + struct rex_scan_ctx *ctx = raw_ctx; + struct rex_scan_attr *attr = ctx->attr; + u32 features = attr->handler_flags; + + attr->last_event = (struct rex_event){ + .expression = expression, + .from = from, + .to = to, + .flags = flags, + }; + + trace_rex_match(attr); + attr->nr_events += 1; + + return (features & REX_SINGLE_SHOT) ? 1 : 0; +} + +int bpf_scan_bytes(const void *buf, __u32 buf__sz, struct rex_scan_attr *attr) +{ + struct rex_scan_ctx ctx = { + .attr = attr, + .block = buf, + .block_len = buf__sz, + }; + struct rex_policy *rex; + struct rex_database *db; + hs_scratch_t *scratch; + hs_error_t err; + + WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_bh_held()); + + if (unlikely(!buf || !attr)) + return -EINVAL; + + rex = idr_find(&rex_idr, attr->database_id); + if (unlikely(!rex)) + return -EBADF; + + db = rcu_dereference(rex->database); + if (unlikely(!db)) + return -ENODATA; + + scratch = this_cpu_ptr(db->scratch); + + kernel_fpu_begin(); + err = hs_scan(patterns(db), buf, buf__sz, 0, scratch, rex_scan_cb, + &ctx); + kernel_fpu_end(); + + switch (err) { + case HS_DB_MODE_ERROR: + return -ENOEXEC; + case HS_SCAN_TERMINATED: + return 1; + case HS_SUCCESS: + return 0; + case HS_SCRATCH_IN_USE: + case HS_INVALID: + case HS_UNKNOWN_ERROR: + default: + WARN(1, "hs_scan() failed with code %d\n", (int)err); + return -EFAULT; + } +} +EXPORT_SYMBOL(bpf_scan_bytes); + +int bpf_scan_vector(const char *const *buf, + const unsigned int *length, + __u32 buf__sz, + struct rex_scan_attr *attr) +{ + struct rex_scan_ctx ctx = { + .attr = attr, + .block = buf, + .block_len = buf__sz, + }; + struct rex_policy *rex; + struct rex_database *db; + hs_scratch_t *scratch; + hs_error_t err; + + WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_bh_held()); + + if (unlikely(!buf || !attr)) + return -EINVAL; + + rex = idr_find(&rex_idr, attr->database_id); + if (unlikely(!rex)) + return -EBADF; + + db = rcu_dereference(rex->database); + if (unlikely(!db)) + return -ENODATA; + + scratch = this_cpu_ptr(db->scratch); + + kernel_fpu_begin(); + err = hs_scan_vector(patterns(db), buf, length, buf__sz, 0, + scratch, rex_scan_cb, &ctx); + kernel_fpu_end(); + + switch (err) { + case HS_DB_MODE_ERROR: + return -ENOEXEC; + case HS_SCAN_TERMINATED: + return 1; + case HS_SUCCESS: + return 0; + case HS_SCRATCH_IN_USE: + case HS_INVALID: + case HS_UNKNOWN_ERROR: + default: + WARN(1, "hs_scan() failed with code %d\n", (int)err); + return -EFAULT; + } +} +EXPORT_SYMBOL(bpf_scan_vector); + +int bpf_scan_tfwstr(const TfwStr *str, + struct rex_scan_attr *attr) +{ + struct rex_scan_ctx ctx = { + .attr = attr, + .block = str, + .block_len = str->len, + }; + struct rex_policy *rex; + struct rex_database *db; + hs_scratch_t *scratch; + hs_error_t err; + + WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_bh_held()); + + if (unlikely(!str || !attr)) + return -EINVAL; + + rex = idr_find(&rex_idr, attr->database_id); + if (unlikely(!rex)) + return -EBADF; + + db = rcu_dereference(rex->database); + if (unlikely(!db)) + return -ENODATA; + + scratch = this_cpu_ptr(db->scratch); + + kernel_fpu_begin(); + + + err = 
hs_scan_tfwstr(patterns(db), str, 0, + scratch, rex_scan_cb, &ctx); + + kernel_fpu_end(); + + switch (err) { + case HS_DB_MODE_ERROR: + return -ENOEXEC; + case HS_SCAN_TERMINATED: + return 1; + case HS_SUCCESS: + return 0; + case HS_SCRATCH_IN_USE: + case HS_INVALID: + case HS_UNKNOWN_ERROR: + default: + WARN(1, "hs_scan() failed with code %d\n", (int)err); + return -EFAULT; + } +} +EXPORT_SYMBOL(bpf_scan_tfwstr); + +#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 18, 0) +/* Based on code taken from net/core/filter.c */ +/*static void *bpf_xdp_pointer(const struct xdp_buff *xdp, u32 offset, u32 len) +{ + u32 size = xdp->data_end - xdp->data; + void *addr = xdp->data; + + if (unlikely(offset > 0xffff || len > 0xffff)) + return ERR_PTR(-EFAULT); + + if (offset + len > size) + return ERR_PTR(-EINVAL); + + return addr + offset; +}*/ +#else +#if LINUX_VERSION_CODE < KERNEL_VERSION(6, 1, 0) +/* This code is taken from net/core/filter.c */ +static void *bpf_xdp_pointer(const struct xdp_buff *xdp, u32 offset, u32 len) +{ + u32 size = xdp->data_end - xdp->data; + void *addr = xdp->data; + struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp); + int i; + + if (unlikely(offset > 0xffff || len > 0xffff)) + return ERR_PTR(-EFAULT); + + if (offset + len > xdp_get_buff_len(xdp)) + return ERR_PTR(-EINVAL); + + if (offset < size) /* linear area */ + goto out; + + offset -= size; + for (i = 0; i < sinfo->nr_frags; i++) { /* paged area */ + u32 frag_size = skb_frag_size(&sinfo->frags[i]); + + if (offset < frag_size) { + addr = skb_frag_address(&sinfo->frags[i]); + size = frag_size; + break; + } + offset -= frag_size; + } +out: + return offset + len < size ? addr + offset : NULL; +} +#endif +#endif + +/*int bpf_xdp_scan_bytes(const struct xdp_md *xdp_md, u32 offset, u32 len, + struct rex_scan_attr *scan_attr) +{ + struct xdp_buff *xdp = (struct xdp_buff *)xdp_md; + void *ptr = bpf_xdp_pointer(xdp, offset, len); + + if (IS_ERR(ptr)) + return PTR_ERR(ptr); + + if (likely(ptr)) + return bpf_scan_bytes(ptr, len, scan_attr); + else + return -ENOTSUPP; +} +EXPORT_SYMBOL(bpf_xdp_scan_bytes); + +BTF_SET_START(rex_kfunc_ids) +BTF_ID(func, bpf_scan_bytes) +BTF_ID(func, bpf_xdp_scan_bytes) +BTF_SET_END(rex_kfunc_ids) +static DEFINE_KFUNC_BTF_ID_SET(&rex_kfunc_ids, rex_kfunc_btf_set);*/ + +static struct rex_policy *to_policy(struct config_item *item) +{ + return item ? container_of(item, struct rex_policy, item) : NULL; +} + +static ssize_t rexcfg_database_read(struct config_item *item, void *outbuf, + size_t size) +{ + struct rex_policy *rex = to_policy(item); + struct rex_database *db; + char *bytes = outbuf; + ssize_t ret; + + rcu_read_lock(); + db = rcu_dereference(rex->database); + + if (!bytes) { + /* In first call return size for te buffer. */ + if (hs_database_size(patterns(db), &ret)) + ret = 0; + } else if (size > 0) { + /* In second call fill the buffer with data. + * We have to check size again to avoid races. + */ + if (hs_database_size(patterns(db), &ret) || ret != size) { + ret = -ETXTBSY; + goto out; + } + + if (hs_serialize_database(patterns(db), &bytes, NULL)) { + WARN(1, "hs_serialize_database() failed\n"); + ret = -EIO; + } + + /* Check that pointer wasn't overwritten. 
*/ + BUG_ON(bytes != outbuf); + } else { + return 0; + } + +out: + rcu_read_unlock(); + return ret; +} + +static void rex_assign_database(struct rex_policy *rex, struct rex_database *db) +{ + db = rcu_replace_pointer(rex->database, db, + lockdep_is_held(&rex_config_mutex)); + rex->epoch += 1; + + if (db) { + synchronize_rcu(); + free_percpu(db->scratch); + kfree(db); + } +} + +static ssize_t rexcfg_database_write(struct config_item *item, + const void *bytes, size_t nbytes) +{ + struct rex_policy *rex = to_policy(item); + struct rex_database *db; + hs_scratch_t *proto = NULL; + size_t alloc_size; + int cpu; + + /* Drop existing database on empty write. */ + if (nbytes == 0) { + mutex_lock(&rex_config_mutex); + rex_assign_database(rex, NULL); + mutex_unlock(&rex_config_mutex); + return nbytes; + } + + if (hs_serialized_database_size(bytes, nbytes, &alloc_size)) + return -EIO; + + db = kmalloc(sizeof(*db) + alloc_size, GFP_KERNEL); + if (!db) + return -ENOMEM; + + if (hs_deserialize_database_at(bytes, nbytes, patterns(db))) { + kfree(db); + return -EINVAL; + } + + if (hs_alloc_scratch(patterns(db), &proto)) { + kfree(db); + return -ENOMEM; + } + + BUG_ON(hs_scratch_size(proto, &alloc_size)); + db->scratch = __alloc_percpu(alloc_size, 64); + if (!db->scratch) { + kfree(db); + hs_free_scratch(proto); + return -ENOMEM; + } + + for_each_possible_cpu(cpu) { + hs_scratch_t *dst = per_cpu_ptr(db->scratch, cpu); + + BUG_ON(hs_init_scratch(proto, dst)); + } + hs_free_scratch(proto); + + mutex_lock(&rex_config_mutex); + rex_assign_database(rex, db); + mutex_unlock(&rex_config_mutex); + + return nbytes; +} + +static ssize_t rexcfg_info_show(struct config_item *item, char *str) +{ + struct rex_policy *rex = to_policy(item); + struct rex_database *db; + char *info; + int ret = 0; + + rcu_read_lock(); + + db = rcu_dereference(rex->database); + if (hs_database_info(patterns(db), &info)) { + ret = -EIO; + goto out; + } + + ret += sysfs_emit_at(str, ret, "%s\n", info); + kfree(info); + +out: + rcu_read_unlock(); + return ret; +} + +static ssize_t rexcfg_epoch_show(struct config_item *item, char *str) +{ + return snprintf(str, PAGE_SIZE, "%d\n", to_policy(item)->epoch); +} + +static ssize_t rexcfg_id_show(struct config_item *item, char *str) +{ + return snprintf(str, PAGE_SIZE, "%d\n", to_policy(item)->id); +} + +static ssize_t rexcfg_id_store(struct config_item *item, const char *str, + size_t length) +{ + struct rex_policy *rex = to_policy(item); + int ret, new_id; + + ret = kstrtoint(str, 0, &new_id); + if (ret < 0) + return -EINVAL; + + mutex_lock(&rex_config_mutex); + + if (rex->id == new_id) { + ret = length; + goto out; + } + + ret = idr_alloc(&rex_idr, rex, new_id, new_id + 1, GFP_KERNEL); + if (ret < 0) + goto out; + + BUG_ON(idr_remove(&rex_idr, rex->id) != rex); + rex->id = new_id; + ret = length; + +out: + mutex_unlock(&rex_config_mutex); + return ret; +} + +static ssize_t rexcfg_note_show(struct config_item *item, char *str) +{ + struct rex_policy *rex = to_policy(item); + int ret; + + mutex_lock(&rex->lock); + ret = snprintf(str, PAGE_SIZE, "%s", to_policy(item)->note); + mutex_unlock(&rex->lock); + + return ret; +} + +static ssize_t rexcfg_note_store(struct config_item *item, const char *str, + size_t length) +{ + struct rex_policy *rex = to_policy(item); + + mutex_lock(&rex->lock); + strncpy(rex->note, str, length); + mutex_unlock(&rex->lock); + + return length; +} + +/* Our subsystem hierarchy is: + * + * /sys/kernel/config/rex/ + * | + * / + * | id (rw) + * | database (rw) + * | epoch (ro) + 
* | info (ro) + * | note (rw) + * | + * /... + */ + +CONFIGFS_BIN_ATTR(rexcfg_, database, NULL, 0); +CONFIGFS_ATTR_RO(rexcfg_, epoch); +CONFIGFS_ATTR_RO(rexcfg_, info); +CONFIGFS_ATTR(rexcfg_, id); +CONFIGFS_ATTR(rexcfg_, note); + +static void rexcfg_item_release(struct config_item *item) +{ + struct rex_policy *rex = to_policy(item); + + mutex_lock(&rex_config_mutex); + BUG_ON(idr_remove(&rex_idr, rex->id) != rex); + rex_assign_database(rex, NULL); + mutex_unlock(&rex_config_mutex); +} + +static const struct config_item_type rex_type = { + .ct_owner = THIS_MODULE, + .ct_attrs = (struct configfs_attribute *[]){ &rexcfg_attr_id, + &rexcfg_attr_info, + &rexcfg_attr_epoch, + &rexcfg_attr_note, NULL }, + .ct_bin_attrs = + (struct configfs_bin_attribute *[]){ + &rexcfg_attr_database, + NULL, + }, + .ct_item_ops = + &(struct configfs_item_operations){ + .release = rexcfg_item_release, + } +}; + +static struct config_item *rex_make_item(struct config_group *group, + const char *name) +{ + struct rex_policy *rex; + int id; + + rex = kzalloc(sizeof(*rex), GFP_KERNEL); + if (!rex) + return ERR_PTR(-ENOMEM); + + mutex_lock(&rex_config_mutex); + + /* Patch database attribute type */ + rexcfg_attr_database.cb_max_size = max_db_size; + config_item_init_type_name(&rex->item, name, &rex_type); + + id = idr_alloc(&rex_idr, rex, 0, U32_MAX, GFP_KERNEL); + if (id < 0) { + kfree(rex); + return ERR_PTR(id); + } + rex->id = id; + + mutex_unlock(&rex_config_mutex); + + return &rex->item; +} + +static const struct config_item_type rex_group_type = { + .ct_owner = THIS_MODULE, + .ct_group_ops = + &(struct configfs_group_operations){ + .make_item = rex_make_item, + }, +}; + +static struct configfs_subsystem rex_configfs = { + .su_mutex = __MUTEX_INITIALIZER(rex_configfs.su_mutex), + .su_group = + { + .cg_item = + { + .ci_namebuf = "rex", + .ci_type = &rex_group_type, + }, + }, +}; + +static void banner(void) +{ + pr_info("Hyperscan %s\n", hs_version()); +} + +static int __init rex_init(void) +{ + int err; + + config_group_init(&rex_configfs.su_group); + err = configfs_register_subsystem(&rex_configfs); + if (err) + return err; + + //register_btf_kfunc_id_set(&prog_test_kfunc_list, &rex_kfunc_btf_set); + + banner(); + return 0; +} + +static void __exit rex_exit(void) +{ + //unregister_kfunc_btf_id_set(&prog_test_kfunc_list, &rex_kfunc_btf_set); + configfs_unregister_subsystem(&rex_configfs); + WARN_ON(!idr_is_empty(&rex_idr)); + idr_destroy(&rex_idr); +} + +module_init(rex_init); +module_exit(rex_exit); + +/* Module information */ +MODULE_AUTHOR("Sergey Nizovtsev, sn@tempesta-tech.com"); +MODULE_DESCRIPTION("Hyperscan regex engine"); +MODULE_LICENSE("Dual BSD/GPL"); diff --git a/regex/kmod/rex.h b/regex/kmod/rex.h new file mode 100644 index 000000000..2f08d1394 --- /dev/null +++ b/regex/kmod/rex.h @@ -0,0 +1,82 @@ +/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */ +/* SPDX-FileCopyrightText: Copyright 2022 G-Core Labs S.A. */ + +#ifndef REX_ABI_USER_H +#define REX_ABI_USER_H + +#if !defined(__bpf__) +#include +#include +#define __ksym +#endif + +#include "fw/str.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * Structure describing a match event. + */ +struct rex_event { + unsigned int expression; + unsigned long long from; + unsigned long long to; + unsigned long long flags; +}; + +/* handler_flags */ +enum { + REX_SINGLE_SHOT = 1 << 0, +}; + +/** + * Attributes for bpf_scan_bytes() and bpf_xdp_scan_bytes(). + * + * @database_id: Numeric database handle taken from configfs (in). 
+ * @handler_flags: Customize match handler behaviour (in). + * @event_count: Output number of events (inout). + * @last_event: Space to store match details. (out). + */ +struct rex_scan_attr { + __u32 database_id; + __u32 handler_flags; + __u32 nr_events; + struct rex_event last_event; +}; + +#if defined(__KERNEL__) || defined(__bpf__) + +/** + * Scan any buffer against regex pattern database. + * + * @buf: A pointer to a valid buffer. + * @buf__sz: Number of bytes to scan. + * @scan_attr: Input/output match attributes. + */ +int bpf_scan_bytes(const void *buf, __u32 buf__sz, + struct rex_scan_attr *scan_attr) __ksym; + +int bpf_scan_vector(const char *const *buf, const unsigned int *length, + __u32 buf__sz, struct rex_scan_attr *attr) __ksym; + +int bpf_scan_tfwstr(const TfwStr *str, struct rex_scan_attr *attr) __ksym; + +/** + * Scan @len packet bytes starting from @offset against pattern database. + * Similar to bpf_scan_bytes() but use XDP offsets to trick BPF verifier + * + * @xdp_md: A pointer to struct xdp_buff* actually. + * @scan_attr: Input/output match attributes. + */ +//int bpf_xdp_scan_bytes(const struct xdp_md *xdp_md, __u32 offset, __u32 len, +// struct rex_scan_attr *scan_attr) __ksym; + +#endif /* __KERNEL__ or __bpf__ */ + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif // REX_ABI_USER_H diff --git a/regex/kmod/rex_trace.h b/regex/kmod/rex_trace.h new file mode 100644 index 000000000..c7d5e943d --- /dev/null +++ b/regex/kmod/rex_trace.h @@ -0,0 +1,37 @@ +/* SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause */ +/* SPDX-FileCopyrightText: Copyright 2022 G-Core Labs S.A. */ + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM rex + +#if !defined(_TRACE_REX_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_REX_H + +#include +#include "rex.h" + +TRACE_EVENT(rex_match, TP_PROTO(struct rex_scan_attr *ctx), + + TP_ARGS(ctx), + + TP_STRUCT__entry(__field(__u32, database_id) __field(__u32, + event_index) + __field_struct(struct rex_event, event)), + + TP_fast_assign(__entry->database_id = ctx->database_id; + __entry->event_index = ctx->nr_events; + __entry->event = ctx->last_event;), + + TP_printk("regex=%u/%u at [%llu, %llu]", __entry->database_id, + __entry->event.expression, __entry->event.from, + __entry->event.to)); + +#endif /* _TRACE_REX_H */ + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_FILE rex_trace + +/* This part must be outside protection */ +#include diff --git a/regex/kmod/ue2common_kern.h b/regex/kmod/ue2common_kern.h new file mode 100644 index 000000000..5b91b916a --- /dev/null +++ b/regex/kmod/ue2common_kern.h @@ -0,0 +1,106 @@ +#ifndef UE2COMMON_KERN_H +#define UE2COMMON_KERN_H + +#include "config.h" + +#ifndef pr_fmt +#define pr_fmt(fmt) "hyperscan:%s: " fmt, __func__ +#endif + +/* standard types used across ue2 */ + +/* We use the size_t type all over the place, usually defined in stddef.h. */ +#include +/* stdint.h for things like uintptr_t and friends */ +#include +#include +#include +#include +#include +#include +#include + +/* Linux kernel synonyms */ +#define FALLTHROUGH fallthrough +#define ALIGN_ATTR(x) __aligned(x) +#define ARRAY_LENGTH(a) ARRAY_SIZE(a) +#define UNUSED __always_unused +#define HS_PUBLIC_API /* nothing */ + +#define ALIGN_DIRECTIVE __aligned(16) +#define ALIGN_AVX_DIRECTIVE __aligned(32) +#define ALIGN_CL_DIRECTIVE __aligned(64) + +/* We append the 'a' for aligned, since these aren't common, garden variety + * 64 bit values. 
The alignment is necessary for structs on some platforms, + * so we don't end up performing accidental unaligned accesses. */ +typedef u64 __aligned(8) u64a; +typedef s64 __aligned(8) s64a; + +/* get the SIMD types */ +#include "util/simd_types.h" + +/** \brief Report identifier, used for internal IDs and external IDs (those + * reported on match). */ +typedef u32 ReportID; + +/** \brief Shorthand for the attribute to shut gcc about unused parameters */ + +/* really_inline forces inlining always */ +#if defined(HS_OPTIMIZE) +#define really_inline __always_inline __maybe_unused +#else +#define really_inline __maybe_unused +#endif + +/** no, seriously, inline it, even if building in debug mode */ +#define really_really_inline __always_inline __maybe_unused +#define never_inline noinline +#define alignof __alignof + +/* We use C99-style "restrict". */ +#define restrict __restrict + +/* Align to 16-byte boundary */ +#define ROUNDUP_16(a) (((a) + 0xf) & ~0xf) +#define ROUNDDOWN_16(a) ((a) & ~0xf) + +/* Align to N-byte boundary */ +#define ROUNDUP_N(a, n) (((a) + ((n)-1)) & ~((n)-1)) +#define ROUNDDOWN_N(a, n) ((a) & ~((n)-1)) + +/* Align to a cacheline - assumed to be 64 bytes */ +#define ROUNDUP_CL(a) ROUNDUP_N(a, 64) + +/* Align ptr to next N-byte boundary */ +#define ROUNDUP_PTR(ptr, n) (__typeof__(ptr))(ROUNDUP_N((uintptr_t)(ptr), (n))) +#define ROUNDDOWN_PTR(ptr, n) \ + (__typeof__(ptr))(ROUNDDOWN_N((uintptr_t)(ptr), (n))) + +#define ISALIGNED_N(ptr, n) (((uintptr_t)(ptr) & ((n)-1)) == 0) +#define ISALIGNED_16(ptr) ISALIGNED_N((ptr), 16) +#define ISALIGNED_CL(ptr) ISALIGNED_N((ptr), 64) +#define ISALIGNED(ptr) ISALIGNED_N((ptr), alignof(__typeof__(*(ptr)))) +#define N_CHARS 256 + +/* Maximum offset representable in the 'unsigned long long' we use to return + offset values. */ +#define MAX_OFFSET 0xffffffffffffffffULL + +#if 0 +/* Produces lots of warnings about implicit integer casts */ +#define MIN min +#define MAX max +#else +#define MIN(a, b) ((a) < (b) ? (a) : (b)) +#define MAX(a, b) ((a) > (b) ? (a) : (b)) +#endif + +#define LIMIT_TO_AT_MOST(a, b) (*(a) = MIN(*(a), (b))) +#define ENSURE_AT_LEAST(a, b) (*(a) = MAX(*(a), (b))) + +#define DEBUG_PRINTF(fmt, ...) pr_debug(fmt, ##__VA_ARGS__) + +#define assert(cond) BUG_ON(!(cond)) + +#endif diff --git a/regex/nfa/accel.c b/regex/nfa/accel.c new file mode 100644 index 000000000..2bc60945f --- /dev/null +++ b/regex/nfa/accel.c @@ -0,0 +1,146 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "accel.h" +#include "shufti.h" +#include "truffle.h" +#include "vermicelli.h" +#include "ue2common.h" + +const u8 *run_accel(const union AccelAux *accel, const u8 *c, const u8 *c_end) { + assert(ISALIGNED_N(accel, alignof(union AccelAux))); + const u8 *rv; + + switch (accel->accel_type) { + case ACCEL_NONE: + DEBUG_PRINTF("accel none %p %p\n", c, c_end); + return c; + + case ACCEL_VERM: + DEBUG_PRINTF("accel verm %p %p\n", c, c_end); + if (c + 15 >= c_end) { + return c; + } + + rv = vermicelliExec(accel->verm.c, 0, c, c_end); + break; + + case ACCEL_VERM_NOCASE: + DEBUG_PRINTF("accel verm nc %p %p\n", c, c_end); + if (c + 15 >= c_end) { + return c; + } + + rv = vermicelliExec(accel->verm.c, 1, c, c_end); + break; + + case ACCEL_DVERM: + DEBUG_PRINTF("accel dverm %p %p\n", c, c_end); + if (c + 16 + 1 >= c_end) { + return c; + } + + /* need to stop one early to get an accurate end state */ + rv = vermicelliDoubleExec(accel->dverm.c1, accel->dverm.c2, 0, c, + c_end - 1); + break; + + case ACCEL_DVERM_NOCASE: + DEBUG_PRINTF("accel dverm nc %p %p\n", c, c_end); + if (c + 16 + 1 >= c_end) { + return c; + } + + /* need to stop one early to get an accurate end state */ + rv = vermicelliDoubleExec(accel->dverm.c1, accel->dverm.c2, 1, c, + c_end - 1); + break; + + case ACCEL_DVERM_MASKED: + DEBUG_PRINTF("accel dverm masked %p %p\n", c, c_end); + if (c + 16 + 1 >= c_end) { + return c; + } + + /* need to stop one early to get an accurate end state */ + rv = vermicelliDoubleMaskedExec(accel->dverm.c1, accel->dverm.c2, + accel->dverm.m1, accel->dverm.m2, + c, c_end - 1); + break; + + case ACCEL_SHUFTI: + DEBUG_PRINTF("accel shufti %p %p\n", c, c_end); + if (c + 15 >= c_end) { + return c; + } + + rv = shuftiExec(accel->shufti.lo, accel->shufti.hi, c, c_end); + break; + + case ACCEL_TRUFFLE: + DEBUG_PRINTF("accel Truffle %p %p\n", c, c_end); + if (c + 15 >= c_end) { + return c; + } + + rv = truffleExec(accel->truffle.mask1, accel->truffle.mask2, c, c_end); + break; + + case ACCEL_DSHUFTI: + DEBUG_PRINTF("accel dshufti %p %p\n", c, c_end); + if (c + 15 + 1 >= c_end) { + return c; + } + + /* need to stop one early to get an accurate end state */ + rv = shuftiDoubleExec(accel->dshufti.lo1, + accel->dshufti.hi1, + accel->dshufti.lo2, + accel->dshufti.hi2, c, c_end - 1); + break; + + case ACCEL_RED_TAPE: + DEBUG_PRINTF("accel red tape %p %p\n", c, c_end); + rv = c_end; + break; + + + default: + assert(!"not here"); + return c; + } + + DEBUG_PRINTF("adjusting for offset %u\n", accel->generic.offset); + /* adjust offset to take into account the offset */ + rv = MAX(c + accel->generic.offset, rv); + rv -= accel->generic.offset; + + DEBUG_PRINTF("advanced %zd\n", rv - c); + + return rv; +} diff --git a/regex/nfa/accel.h b/regex/nfa/accel.h new file mode 100644 index 000000000..3a03d0596 --- /dev/null +++ b/regex/nfa/accel.h @@ -0,0 +1,128 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without 
+ * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Acceleration: data structures and common definitions. + */ + +#ifndef ACCEL_H +#define ACCEL_H + +#include "ue2common.h" + +/* run time defs */ +#define BAD_ACCEL_DIST 4 +#define SMALL_ACCEL_PENALTY 8 +#define BIG_ACCEL_PENALTY 32 + +/// Minimum length of the scan buffer for us to attempt acceleration. +#define ACCEL_MIN_LEN 16 + +enum AccelType { + ACCEL_NONE, + ACCEL_VERM, + ACCEL_VERM_NOCASE, + ACCEL_DVERM, + ACCEL_DVERM_NOCASE, + ACCEL_RVERM, + ACCEL_RVERM_NOCASE, + ACCEL_RDVERM, + ACCEL_RDVERM_NOCASE, + ACCEL_REOD, + ACCEL_REOD_NOCASE, + ACCEL_RDEOD, + ACCEL_RDEOD_NOCASE, + ACCEL_SHUFTI, + ACCEL_DSHUFTI, + ACCEL_TRUFFLE, + ACCEL_RED_TAPE, + ACCEL_DVERM_MASKED, +}; + +/** \brief Structure for accel framework. */ +union AccelAux { + u8 accel_type; + struct { + u8 accel_type; + u8 offset; + } generic; + struct { + u8 accel_type; + u8 offset; + u8 c; // uppercase if nocase + } verm; + struct { + u8 accel_type; + u8 offset; + u8 c1; // uppercase if nocase + u8 c2; // uppercase if nocase + u8 m1; // masked variant + u8 m2; // masked variant + } dverm; + struct { + u8 accel_type; + u8 offset; + u8 c; // uppercase if nocase + u8 len; + } mverm; + struct { + u8 accel_type; + u8 offset; + u8 c; // uppercase if nocase + u8 len1; + u8 len2; + } mdverm; + struct { + u8 accel_type; + u8 offset; + m128 lo; + m128 hi; + } shufti; + struct { + u8 accel_type; + u8 offset; + m128 lo1; + m128 hi1; + m128 lo2; + m128 hi2; + } dshufti; + struct { + u8 accel_type; + u8 offset; + m128 mask1; + m128 mask2; + } truffle; +}; + +/** + * Runs the specified acceleration scheme between c and c_end, returns a point + * such that the acceleration scheme does not match before. 
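+ *
+ * Note that when fewer than roughly ACCEL_MIN_LEN bytes remain, most schemes
+ * bail out early and return c unchanged, so callers must not assume that any
+ * forward progress has been made.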
+ */ +const u8 *run_accel(const union AccelAux *accel, const u8 *c, const u8 *c_end); + +#endif diff --git a/regex/nfa/callback.h b/regex/nfa/callback.h new file mode 100644 index 000000000..9bdaa8d14 --- /dev/null +++ b/regex/nfa/callback.h @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2015-2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief NFA Callback definitions, used at runtime. + */ + +#ifndef NFA_CALLBACK_H +#define NFA_CALLBACK_H + +#include "ue2common.h" + +/** \brief The type for an NFA callback. + * + * This is a function that takes as arguments the current start and end offsets + * where the match occurs, the id of the match and the context pointer that was + * passed into the NFA API function that executed the NFA. + * + * The start offset is the "start of match" (SOM) offset for the match. It is + * only provided by engines that natively support SOM tracking (e.g. Gough). + * + * The end offset will be the offset after the character that caused the match. + * Thus, if we have a buffer containing 'abc', then a pattern that matches an + * empty string will have an offset of 0, a pattern that matches 'a' will have + * an offset of 1, and a pattern that matches 'abc' will have an offset of 3, + * which will be a value that is 'beyond' the size of the buffer. That is, if + * we have n characters in the buffer, there are n+1 different potential + * offsets for matches. + * + * This function should return an int - currently the possible return values + * are 0, which means 'stop running the engine' or non-zero, which means + * 'continue matching'. + */ +typedef int (*NfaCallback)(u64a start, u64a end, ReportID id, void *context); + +/** + * standard \ref NfaCallback return value indicating that engine execution + * should continue. (any non-zero value will serve this purpose) + */ +#define MO_CONTINUE_MATCHING 1 + +/** + * \ref NfaCallback return value indicating that engine execution should halt. 
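+ *
+ * A minimal, purely illustrative callback that records the first match and
+ * then asks the engine to stop could look like the sketch below (the struct
+ * and function names are invented for this example):
+ *
+ *     struct first_match { u64a from; u64a to; ReportID id; };
+ *
+ *     static int first_match_cb(u64a start, u64a end, ReportID id,
+ *                               void *context) {
+ *         struct first_match *out = context;
+ *         out->from = start;
+ *         out->to = end;
+ *         out->id = id;
+ *         return MO_HALT_MATCHING;
+ *     }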
+ */ +#define MO_HALT_MATCHING 0 + +#endif // NFA_CALLBACK_H diff --git a/regex/nfa/castle.c b/regex/nfa/castle.c new file mode 100644 index 000000000..7c158b31c --- /dev/null +++ b/regex/nfa/castle.c @@ -0,0 +1,1149 @@ +/* + * Copyright (c) 2015-2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Castle: multi-tenant repeat engine, runtime code. 
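+ *
+ * The data layout that this code consumes (SubCastle array, exclusive
+ * groups, packed repeat state) is documented in castle_internal.h.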
+ */ + +#include "castle.h" + +#include "castle_internal.h" +#include "nfa_api.h" +#include "nfa_api_queue.h" +#include "nfa_internal.h" +#include "repeat.h" +#include "shufti.h" +#include "truffle.h" +#include "vermicelli.h" +#include "util/bitutils.h" +#include "util/multibit.h" +#include "util/partial_store.h" +#include "ue2common.h" + +static really_inline +const struct SubCastle *getSubCastle(const struct Castle *c, u32 num) { + assert(num < c->numRepeats); + const struct SubCastle *sub = + (const struct SubCastle *)((const char *)c + sizeof(struct Castle)); + assert(ISALIGNED(sub)); + return &sub[num]; +} + +static really_inline +const struct RepeatInfo *getRepeatInfo(const struct SubCastle *sub) { + const struct RepeatInfo *repeatInfo = + (const struct RepeatInfo *)((const char *)sub + sub->repeatInfoOffset); + return repeatInfo; +} + +static really_inline +union RepeatControl *getControl(char *full_state, const struct SubCastle *sub) { + union RepeatControl *rctrl = + (union RepeatControl *)(full_state + sub->fullStateOffset); + assert(ISALIGNED(rctrl)); + return rctrl; +} + +static really_inline +const union RepeatControl *getControlConst(const char *full_state, + const struct SubCastle *sub) { + const union RepeatControl *rctrl = + (const union RepeatControl *)(full_state + sub->fullStateOffset); + assert(ISALIGNED(rctrl)); + return rctrl; +} + +enum MatchMode { + CALLBACK_OUTPUT, + STOP_AT_MATCH, +}; + +static really_inline +char subCastleReportCurrent(const struct Castle *c, struct mq *q, + const u64a offset, const u32 subIdx) { + const struct SubCastle *sub = getSubCastle(c, subIdx); + const struct RepeatInfo *info = getRepeatInfo(sub); + + union RepeatControl *rctrl = getControl(q->state, sub); + char *rstate = (char *)q->streamState + sub->streamStateOffset + + info->packedCtrlSize; + enum RepeatMatch match = + repeatHasMatch(info, rctrl, rstate, offset); + DEBUG_PRINTF("repeatHasMatch returned %d\n", match); + if (match == REPEAT_MATCH) { + DEBUG_PRINTF("firing match at %llu for sub %u, report %u\n", offset, + subIdx, sub->report); + if (q->cb(0, offset, sub->report, q->context) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } + + return MO_CONTINUE_MATCHING; +} + +static really_inline +int castleReportCurrent(const struct Castle *c, struct mq *q) { + const u64a offset = q_cur_offset(q); + DEBUG_PRINTF("offset=%llu\n", offset); + + if (c->exclusive) { + u8 *active = (u8 *)q->streamState; + u8 *groups = active + c->groupIterOffset; + for (u32 i = mmbit_iterate(groups, c->numGroups, MMB_INVALID); + i != MMB_INVALID; i = mmbit_iterate(groups, c->numGroups, i)) { + u8 *cur = active + i * c->activeIdxSize; + const u32 activeIdx = partial_load_u32(cur, c->activeIdxSize); + DEBUG_PRINTF("subcastle %u\n", activeIdx); + if (subCastleReportCurrent(c, q, + offset, activeIdx) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } + } + + if (c->exclusive != PURE_EXCLUSIVE) { + const u8 *active = (const u8 *)q->streamState + c->activeOffset; + for (u32 i = mmbit_iterate(active, c->numRepeats, MMB_INVALID); + i != MMB_INVALID; i = mmbit_iterate(active, c->numRepeats, i)) { + DEBUG_PRINTF("subcastle %u\n", i); + if (subCastleReportCurrent(c, q, offset, i) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } + } + + return MO_CONTINUE_MATCHING; +} + +static really_inline +char subCastleInAccept(const struct Castle *c, struct mq *q, + const ReportID report, const u64a offset, + const u32 subIdx) { + const struct SubCastle *sub = getSubCastle(c, subIdx); + + if (sub->report 
!= report) { + return 0; + } + const struct RepeatInfo *info = getRepeatInfo(sub); + + union RepeatControl *rctrl = getControl(q->state, sub); + char *rstate = (char *)q->streamState + sub->streamStateOffset + + info->packedCtrlSize; + enum RepeatMatch match = + repeatHasMatch(info, rctrl, rstate, offset); + if (match == REPEAT_MATCH) { + DEBUG_PRINTF("in an accept\n"); + return 1; + } + + return 0; +} + +static really_inline +char castleInAccept(const struct Castle *c, struct mq *q, + const ReportID report, const u64a offset) { + DEBUG_PRINTF("offset=%llu\n", offset); + /* ignore when just catching up due to full queue */ + if (report == MO_INVALID_IDX) { + return 0; + } + + if (c->exclusive) { + u8 *active = (u8 *)q->streamState; + u8 *groups = active + c->groupIterOffset; + for (u32 i = mmbit_iterate(groups, c->numGroups, MMB_INVALID); + i != MMB_INVALID; i = mmbit_iterate(groups, c->numGroups, i)) { + u8 *cur = active + i * c->activeIdxSize; + const u32 activeIdx = partial_load_u32(cur, c->activeIdxSize); + DEBUG_PRINTF("subcastle %u\n", activeIdx); + if (subCastleInAccept(c, q, report, offset, activeIdx)) { + return 1; + } + } + } + + if (c->exclusive != PURE_EXCLUSIVE) { + const u8 *active = (const u8 *)q->streamState + c->activeOffset; + for (u32 i = mmbit_iterate(active, c->numRepeats, MMB_INVALID); + i != MMB_INVALID; i = mmbit_iterate(active, c->numRepeats, i)) { + DEBUG_PRINTF("subcastle %u\n", i); + if (subCastleInAccept(c, q, report, offset, i)) { + return 1; + } + } + } + + return 0; +} + +static really_inline +void subCastleDeactivateStaleSubs(const struct Castle *c, const u64a offset, + void *full_state, void *stream_state, + const u32 subIdx) { + const struct SubCastle *sub = getSubCastle(c, subIdx); + const struct RepeatInfo *info = getRepeatInfo(sub); + + union RepeatControl *rctrl = getControl(full_state, sub); + char *rstate = (char *)stream_state + sub->streamStateOffset + + info->packedCtrlSize; + + if (repeatHasMatch(info, rctrl, rstate, offset) == REPEAT_STALE) { + DEBUG_PRINTF("sub %u is stale at offset %llu\n", subIdx, offset); + if (sub->exclusiveId < c->numRepeats) { + u8 *active = (u8 *)stream_state; + u8 *groups = active + c->groupIterOffset; + mmbit_unset(groups, c->numGroups, sub->exclusiveId); + } else { + u8 *active = (u8 *)stream_state + c->activeOffset; + mmbit_unset(active, c->numRepeats, subIdx); + } + } +} + +static really_inline +void castleDeactivateStaleSubs(const struct Castle *c, const u64a offset, + void *full_state, void *stream_state) { + DEBUG_PRINTF("offset=%llu\n", offset); + + if (!c->staleIterOffset) { + DEBUG_PRINTF("{no repeats can go stale}\n"); + return; /* no subcastle can ever go stale */ + } + + if (c->exclusive) { + u8 *active = (u8 *)stream_state; + u8 *groups = active + c->groupIterOffset; + for (u32 i = mmbit_iterate(groups, c->numGroups, MMB_INVALID); + i != MMB_INVALID; i = mmbit_iterate(groups, c->numGroups, i)) { + u8 *cur = active + i * c->activeIdxSize; + const u32 activeIdx = partial_load_u32(cur, c->activeIdxSize); + DEBUG_PRINTF("subcastle %u\n", activeIdx); + subCastleDeactivateStaleSubs(c, offset, full_state, + stream_state, activeIdx); + } + } + + if (c->exclusive != PURE_EXCLUSIVE) { + const u8 *active = (const u8 *)stream_state + c->activeOffset; + const struct mmbit_sparse_iter *it + = (const void *)((const char *)c + c->staleIterOffset); + + struct mmbit_sparse_state si_state[MAX_SPARSE_ITER_STATES]; + u32 numRepeats = c->numRepeats; + u32 idx = 0; + + u32 i = mmbit_sparse_iter_begin(active, numRepeats, &idx, 
it, si_state); + while(i != MMB_INVALID) { + DEBUG_PRINTF("subcastle %u\n", i); + subCastleDeactivateStaleSubs(c, offset, full_state, stream_state, i); + i = mmbit_sparse_iter_next(active, numRepeats, i, &idx, it, + si_state); + } + } +} + +static really_inline +void castleProcessTop(const struct Castle *c, const u32 top, const u64a offset, + void *full_state, void *stream_state, + UNUSED char stale_checked) { + assert(top < c->numRepeats); + + const struct SubCastle *sub = getSubCastle(c, top); + const struct RepeatInfo *info = getRepeatInfo(sub); + union RepeatControl *rctrl = getControl(full_state, sub); + char *rstate = (char *)stream_state + sub->streamStateOffset + + info->packedCtrlSize; + + char is_alive = 0; + u8 *active = (u8 *)stream_state; + if (sub->exclusiveId < c->numRepeats) { + u8 *groups = active + c->groupIterOffset; + active += sub->exclusiveId * c->activeIdxSize; + if (mmbit_set(groups, c->numGroups, sub->exclusiveId)) { + const u32 activeIdx = partial_load_u32(active, c->activeIdxSize); + is_alive = (activeIdx == top); + } + + if (!is_alive) { + partial_store_u32(active, top, c->activeIdxSize); + } + } else { + active += c->activeOffset; + is_alive = mmbit_set(active, c->numRepeats, top); + } + + if (!is_alive) { + DEBUG_PRINTF("first top for inactive repeat %u\n", top); + } else { + DEBUG_PRINTF("repeat %u is already alive\n", top); + // Caller should ensure we're not stale. + assert(!stale_checked + || repeatHasMatch(info, rctrl, rstate, offset) != REPEAT_STALE); + + // Ignore duplicate top events. + u64a last = repeatLastTop(info, rctrl, rstate); + + assert(last <= offset); + if (last == offset) { + DEBUG_PRINTF("dupe top at %llu\n", offset); + return; + } + } + + repeatStore(info, rctrl, rstate, offset, is_alive); +} + +static really_inline +void subCastleFindMatch(const struct Castle *c, const u64a begin, + const u64a end, void *full_state, void *stream_state, + size_t *mloc, char *found, const u32 subIdx) { + const struct SubCastle *sub = getSubCastle(c, subIdx); + const struct RepeatInfo *info = getRepeatInfo(sub); + union RepeatControl *rctrl = getControl(full_state, sub); + char *rstate = (char *)stream_state + sub->streamStateOffset + + info->packedCtrlSize; + + u64a match = repeatNextMatch(info, rctrl, rstate, begin); + if (match == 0) { + DEBUG_PRINTF("no more matches for sub %u\n", subIdx); + if (sub->exclusiveId < c->numRepeats) { + u8 *groups = (u8 *)stream_state + c->groupIterOffset; + mmbit_unset(groups, c->numGroups, sub->exclusiveId); + } else { + u8 *active = (u8 *)stream_state + c->activeOffset; + mmbit_unset(active, c->numRepeats, subIdx); + } + return; + } else if (match > end) { + DEBUG_PRINTF("next match for sub %u at %llu is > horizon\n", subIdx, + match); + return; + } + DEBUG_PRINTF("sub %u earliest match at %llu\n", subIdx, match); + size_t diff = match - begin; + if (!(*found) || diff < *mloc) { + *mloc = diff; + DEBUG_PRINTF("mloc=%zu\n", *mloc); + } + *found = 1; +} + +static really_inline +char castleFindMatch(const struct Castle *c, const u64a begin, const u64a end, + void *full_state, void *stream_state, size_t *mloc) { + DEBUG_PRINTF("begin=%llu, end=%llu\n", begin, end); + assert(begin <= end); + + if (begin == end) { + DEBUG_PRINTF("no work to do\n"); + return 0; + } + + char found = 0; + *mloc = 0; + + if (c->exclusive) { + u8 *active = (u8 *)stream_state; + u8 *groups = active + c->groupIterOffset; + for (u32 i = mmbit_iterate(groups, c->numGroups, MMB_INVALID); + i != MMB_INVALID; i = mmbit_iterate(groups, c->numGroups, i)) { + 
u8 *cur = active + i * c->activeIdxSize; + const u32 activeIdx = partial_load_u32(cur, c->activeIdxSize); + DEBUG_PRINTF("subcastle %u\n", activeIdx); + subCastleFindMatch(c, begin, end, full_state, stream_state, mloc, + &found, activeIdx); + } + } + + if (c->exclusive != PURE_EXCLUSIVE) { + u8 *active = (u8 *)stream_state + c->activeOffset; + for (u32 i = mmbit_iterate(active, c->numRepeats, MMB_INVALID); + i != MMB_INVALID; + i = mmbit_iterate(active, c->numRepeats, i)) { + DEBUG_PRINTF("subcastle %u\n", i); + subCastleFindMatch(c, begin, end, full_state, stream_state, mloc, + &found, i); + } + } + + return found; +} + +static really_inline +u64a subCastleNextMatch(const struct Castle *c, void *full_state, + void *stream_state, const u64a loc, + const u32 subIdx) { + DEBUG_PRINTF("subcastle %u\n", subIdx); + const struct SubCastle *sub = getSubCastle(c, subIdx); + const struct RepeatInfo *info = getRepeatInfo(sub); + const union RepeatControl *rctrl = + getControlConst(full_state, sub); + const char *rstate = (const char *)stream_state + + sub->streamStateOffset + + info->packedCtrlSize; + + return repeatNextMatch(info, rctrl, rstate, loc); +} + +static really_inline +void set_matching(const struct Castle *c, const u64a match, u8 *active, + u8 *matching, const u32 active_size, const u32 active_id, + const u32 matching_id, u64a *offset, const u64a end) { + if (match == 0) { + DEBUG_PRINTF("no more matches\n"); + mmbit_unset(active, active_size, active_id); + } else if (match > end) { + // If we had a local copy of the active mmbit, we could skip + // looking at this repeat again. But we don't, so we just move + // on. + } else if (match == *offset) { + mmbit_set(matching, c->numRepeats, matching_id); + } else if (match < *offset) { + // New minimum offset. + *offset = match; + mmbit_clear(matching, c->numRepeats); + mmbit_set(matching, c->numRepeats, matching_id); + } +} + +static really_inline +void subCastleMatchLoop(const struct Castle *c, void *full_state, + void *stream_state, const u64a end, + const u64a loc, u64a *offset) { + u8 *active = (u8 *)stream_state + c->activeOffset; + u8 *matching = full_state; + for (u32 i = mmbit_iterate(active, c->numRepeats, MMB_INVALID); + i != MMB_INVALID; i = mmbit_iterate(active, c->numRepeats, i)) { + u64a match = subCastleNextMatch(c, full_state, stream_state, loc, i); + set_matching(c, match, active, matching, c->numRepeats, i, + i, offset, end); + } +} + +static really_inline +char subCastleFireMatch(const struct Castle *c, const void *full_state, + UNUSED const void *stream_state, NfaCallback cb, + void *ctx, const u64a offset) { + const u8 *matching = full_state; + + // Fire all matching sub-castles at this offset. 
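+    // The 'matching' multibit lives in scratch (full_state) and is populated
+    // by castleMatchLoop() with the subcastles whose next match falls at this
+    // offset.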
+ for (u32 i = mmbit_iterate(matching, c->numRepeats, MMB_INVALID); + i != MMB_INVALID; + i = mmbit_iterate(matching, c->numRepeats, i)) { + const struct SubCastle *sub = getSubCastle(c, i); + DEBUG_PRINTF("firing match at %llu for sub %u\n", offset, i); + if (cb(0, offset, sub->report, ctx) == MO_HALT_MATCHING) { + DEBUG_PRINTF("caller told us to halt\n"); + return MO_HALT_MATCHING; + } + } + + return MO_CONTINUE_MATCHING; +} + +static really_inline +char castleMatchLoop(const struct Castle *c, const u64a begin, const u64a end, + void *full_state, void *stream_state, NfaCallback cb, + void *ctx) { + DEBUG_PRINTF("begin=%llu, end=%llu\n", begin, end); + assert(begin <= end); + + u8 *matching = full_state; // temp multibit + + u64a loc = begin; + while (loc < end) { + + // Find minimum next offset for the next match(es) from amongst our + // active sub-castles, and store the indices of the sub-castles that + // match at that offset in the 'matching' mmbit, which is in the + // full_state (scratch). + + u64a offset = end; // min offset of next match + u32 activeIdx = 0; + mmbit_clear(matching, c->numRepeats); + if (c->exclusive) { + u8 *active = (u8 *)stream_state; + u8 *groups = active + c->groupIterOffset; + for (u32 i = mmbit_iterate(groups, c->numGroups, MMB_INVALID); + i != MMB_INVALID; i = mmbit_iterate(groups, c->numGroups, i)) { + u8 *cur = active + i * c->activeIdxSize; + activeIdx = partial_load_u32(cur, c->activeIdxSize); + u64a match = subCastleNextMatch(c, full_state, stream_state, + loc, activeIdx); + set_matching(c, match, groups, matching, c->numGroups, i, + activeIdx, &offset, end); + } + } + + if (c->exclusive != PURE_EXCLUSIVE) { + subCastleMatchLoop(c, full_state, stream_state, + end, loc, &offset); + } + DEBUG_PRINTF("offset=%llu\n", offset); + if (!mmbit_any(matching, c->numRepeats)) { + DEBUG_PRINTF("no more matches\n"); + break; + } + + if (subCastleFireMatch(c, full_state, stream_state, + cb, ctx, offset) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + loc = offset; + } + + return MO_CONTINUE_MATCHING; +} + +static really_inline +char castleScanVerm(const struct Castle *c, const u8 *buf, const size_t begin, + const size_t end, size_t *loc) { + const u8 *ptr = vermicelliExec(c->u.verm.c, 0, buf + begin, buf + end); + if (ptr == buf + end) { + DEBUG_PRINTF("no escape found\n"); + return 0; + } + + assert(loc); + assert(ptr >= buf && ptr < buf + end); + *loc = (size_t)(ptr - buf); + DEBUG_PRINTF("escape found at offset %zu\n", *loc); + return 1; +} + +static really_inline +char castleScanNVerm(const struct Castle *c, const u8 *buf, const size_t begin, + const size_t end, size_t *loc) { + const u8 *ptr = nvermicelliExec(c->u.verm.c, 0, buf + begin, buf + end); + if (ptr == buf + end) { + DEBUG_PRINTF("no escape found\n"); + return 0; + } + + assert(loc); + assert(ptr >= buf && ptr < buf + end); + *loc = (size_t)(ptr - buf); + DEBUG_PRINTF("escape found at offset %zu\n", *loc); + return 1; +} + +static really_inline +char castleScanShufti(const struct Castle *c, const u8 *buf, const size_t begin, + const size_t end, size_t *loc) { + const m128 mask_lo = c->u.shuf.mask_lo; + const m128 mask_hi = c->u.shuf.mask_hi; + const u8 *ptr = shuftiExec(mask_lo, mask_hi, buf + begin, buf + end); + if (ptr == buf + end) { + DEBUG_PRINTF("no escape found\n"); + return 0; + } + + assert(loc); + assert(ptr >= buf && ptr < buf + end); + *loc = (size_t)(ptr - buf); + DEBUG_PRINTF("escape found at offset %zu\n", *loc); + return 1; +} + +static really_inline +char 
castleScanTruffle(const struct Castle *c, const u8 *buf, const size_t begin, + const size_t end, size_t *loc) { + const u8 *ptr = truffleExec(c->u.truffle.mask1, c->u.truffle.mask2, + buf + begin, buf + end); + if (ptr == buf + end) { + DEBUG_PRINTF("no escape found\n"); + return 0; + } + + assert(loc); + assert(ptr >= buf && ptr < buf + end); + *loc = (size_t)(ptr - buf); + DEBUG_PRINTF("escape found at offset %zu\n", *loc); + return 1; +} + +static really_inline +char castleScan(const struct Castle *c, const u8 *buf, const size_t begin, + const size_t end, size_t *loc) { + assert(begin <= end); + + if (begin == end) { + return 0; + } + + switch (c->type) { + case CASTLE_DOT: + // Nothing can stop a dot scan! + return 0; + case CASTLE_VERM: + return castleScanVerm(c, buf, begin, end, loc); + case CASTLE_NVERM: + return castleScanNVerm(c, buf, begin, end, loc); + case CASTLE_SHUFTI: + return castleScanShufti(c, buf, begin, end, loc); + case CASTLE_TRUFFLE: + return castleScanTruffle(c, buf, begin, end, loc); + default: + DEBUG_PRINTF("unknown scan type!\n"); + assert(0); + return 0; + } +} + +static really_inline +char castleRevScanVerm(const struct Castle *c, const u8 *buf, + const size_t begin, const size_t end, size_t *loc) { + const u8 *ptr = rvermicelliExec(c->u.verm.c, 0, buf + begin, buf + end); + if (ptr == buf + begin - 1) { + DEBUG_PRINTF("no escape found\n"); + return 0; + } + + assert(loc); + assert(ptr >= buf && ptr < buf + end); + *loc = (size_t)(ptr - buf); + DEBUG_PRINTF("escape found at offset %zu\n", *loc); + return 1; +} + +static really_inline +char castleRevScanNVerm(const struct Castle *c, const u8 *buf, + const size_t begin, const size_t end, size_t *loc) { + const u8 *ptr = rnvermicelliExec(c->u.verm.c, 0, buf + begin, buf + end); + if (ptr == buf + begin - 1) { + DEBUG_PRINTF("no escape found\n"); + return 0; + } + + assert(loc); + assert(ptr >= buf && ptr < buf + end); + *loc = (size_t)(ptr - buf); + DEBUG_PRINTF("escape found at offset %zu\n", *loc); + return 1; +} + +static really_inline +char castleRevScanShufti(const struct Castle *c, const u8 *buf, + const size_t begin, const size_t end, size_t *loc) { + const m128 mask_lo = c->u.shuf.mask_lo; + const m128 mask_hi = c->u.shuf.mask_hi; + const u8 *ptr = rshuftiExec(mask_lo, mask_hi, buf + begin, buf + end); + if (ptr == buf + begin - 1) { + DEBUG_PRINTF("no escape found\n"); + return 0; + } + + assert(loc); + assert(ptr >= buf && ptr < buf + end); + *loc = (size_t)(ptr - buf); + DEBUG_PRINTF("escape found at offset %zu\n", *loc); + return 1; +} + +static really_inline +char castleRevScanTruffle(const struct Castle *c, const u8 *buf, + const size_t begin, const size_t end, size_t *loc) { + const u8 *ptr = rtruffleExec(c->u.truffle.mask1, c->u.truffle.mask2, + buf + begin, buf + end); + if (ptr == buf + begin - 1) { + DEBUG_PRINTF("no escape found\n"); + return 0; + } + + assert(loc); + assert(ptr >= buf && ptr < buf + end); + *loc = (size_t)(ptr - buf); + DEBUG_PRINTF("escape found at offset %zu\n", *loc); + return 1; +} + +static really_inline +char castleRevScan(const struct Castle *c, const u8 *buf, const size_t begin, + const size_t end, size_t *loc) { + assert(begin <= end); + DEBUG_PRINTF("scanning backwards over (%zu,%zu]\n", begin, end); + if (begin == end) { + return 0; + } + + switch (c->type) { + case CASTLE_DOT: + // Nothing can stop a dot scan! 
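+        // A dot repeat matches every byte, so there is no escape character
+        // to search for.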
+ return 0; + case CASTLE_VERM: + return castleRevScanVerm(c, buf, begin, end, loc); + case CASTLE_NVERM: + return castleRevScanNVerm(c, buf, begin, end, loc); + case CASTLE_SHUFTI: + return castleRevScanShufti(c, buf, begin, end, loc); + case CASTLE_TRUFFLE: + return castleRevScanTruffle(c, buf, begin, end, loc); + default: + DEBUG_PRINTF("unknown scan type!\n"); + assert(0); + return 0; + } +} + +static really_inline +void castleHandleEvent(const struct Castle *c, struct mq *q, const u64a sp, + char stale_checked) { + const u32 event = q->items[q->cur].type; + switch (event) { + case MQE_TOP: + assert(0); // should be a numbered top + break; + case MQE_START: + case MQE_END: + break; + default: + assert(event >= MQE_TOP_FIRST); + assert(event < MQE_INVALID); + u32 top = event - MQE_TOP_FIRST; + DEBUG_PRINTF("top %u at offset %llu\n", top, sp); + castleProcessTop(c, top, sp, q->state, q->streamState, stale_checked); + break; + } +} + +static really_inline +void clear_repeats(const struct Castle *c, const struct mq *q, u8 *active) { + DEBUG_PRINTF("clearing active repeats due to escape\n"); + if (c->exclusive) { + u8 *groups = (u8 *)q->streamState + c->groupIterOffset; + mmbit_clear(groups, c->numGroups); + } + + if (c->exclusive != PURE_EXCLUSIVE) { + mmbit_clear(active, c->numRepeats); + } +} + +static really_inline +char nfaExecCastle_Q_i(const struct NFA *n, struct mq *q, s64a end, + enum MatchMode mode) { + assert(n && q); + assert(n->type == CASTLE_NFA); + + DEBUG_PRINTF("state=%p, streamState=%p\n", q->state, q->streamState); + + const struct Castle *c = getImplNfa(n); + + if (q->report_current) { + int rv = castleReportCurrent(c, q); + q->report_current = 0; + if (rv == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } + + if (q->cur == q->end) { + return 1; + } + + u8 *active = (u8 *)q->streamState + c->activeOffset;// active multibit + + assert(q->cur + 1 < q->end); // require at least two items + assert(q_cur_type(q) == MQE_START); + u64a sp = q_cur_offset(q); + q->cur++; + DEBUG_PRINTF("sp=%llu, abs_end=%llu\n", sp, end + q->offset); + + while (q->cur < q->end) { + DEBUG_PRINTF("q item type=%d offset=%llu\n", q_cur_type(q), + q_cur_offset(q)); + + char found = 0; + if (c->exclusive) { + u8 *groups = (u8 *)q->streamState + c->groupIterOffset; + found = mmbit_any(groups, c->numGroups); + } + + if (!found && !mmbit_any(active, c->numRepeats)) { + DEBUG_PRINTF("no repeats active, skipping scan\n"); + goto scan_done; + } + + u64a ep = q_cur_offset(q); + ep = MIN(ep, q->offset + end); + if (sp < ep) { + size_t eloc = 0; + char escape_found = 0; + DEBUG_PRINTF("scanning from sp=%llu to ep=%llu\n", sp, ep); + assert(sp >= q->offset && ep >= q->offset); + if (castleScan(c, q->buffer, sp - q->offset, ep - q->offset, + &eloc)) { + escape_found = 1; + ep = q->offset + eloc; + DEBUG_PRINTF("escape found at %llu\n", ep); + assert(ep >= sp); + } + + assert(sp <= ep); + + if (mode == STOP_AT_MATCH) { + size_t mloc; + if (castleFindMatch(c, sp, ep, q->state, q->streamState, + &mloc)) { + DEBUG_PRINTF("storing match at %llu\n", sp + mloc); + q->cur--; + assert(q->cur < MAX_MQE_LEN); + q->items[q->cur].type = MQE_START; + q->items[q->cur].location = (s64a)(sp - q->offset) + mloc; + return MO_MATCHES_PENDING; + } + } else { + assert(mode == CALLBACK_OUTPUT); + char rv = castleMatchLoop(c, sp, ep, q->state, q->streamState, + q->cb, q->context); + if (rv == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + assert(rv == MO_CONTINUE_MATCHING); + } + + if (escape_found) { + clear_repeats(c, q, 
active); + } + } + + scan_done: + if (q_cur_loc(q) > end) { + q->cur--; + assert(q->cur < MAX_MQE_LEN); + q->items[q->cur].type = MQE_START; + q->items[q->cur].location = end; + return MO_ALIVE; + } + + sp = q_cur_offset(q); + castleHandleEvent(c, q, sp, 1); + q->cur++; + } + + if (c->exclusive) { + u8 *groups = (u8 *)q->streamState + c->groupIterOffset; + if (mmbit_any_precise(groups, c->numGroups)) { + return 1; + } + } + + return mmbit_any_precise(active, c->numRepeats); +} + +char nfaExecCastle_Q(const struct NFA *n, struct mq *q, s64a end) { + DEBUG_PRINTF("entry\n"); + return nfaExecCastle_Q_i(n, q, end, CALLBACK_OUTPUT); +} + +char nfaExecCastle_Q2(const struct NFA *n, struct mq *q, s64a end) { + DEBUG_PRINTF("entry\n"); + return nfaExecCastle_Q_i(n, q, end, STOP_AT_MATCH); +} + +static +s64a castleLastKillLoc(const struct Castle *c, struct mq *q) { + assert(q_cur_type(q) == MQE_START); + assert(q_last_type(q) == MQE_END); + s64a sp = q_cur_loc(q); + s64a ep = q_last_loc(q); + + DEBUG_PRINTF("finding final squash in (%lld, %lld]\n", sp, ep); + + size_t loc; + + if (ep > 0) { + if (castleRevScan(c, q->buffer, sp > 0 ? sp : 0, ep, &loc)) { + return (s64a)loc; + } + ep = 0; + } + + if (sp < 0) { + s64a hlen = q->hlength; + + if (castleRevScan(c, q->history, sp + hlen, ep + hlen, &loc)) { + return (s64a)loc - hlen; + } + ep = 0; + } + + return sp - 1; /* the repeats are never killed */ +} + +char nfaExecCastle_QR(const struct NFA *n, struct mq *q, ReportID report) { + assert(n && q); + assert(n->type == CASTLE_NFA); + DEBUG_PRINTF("entry\n"); + + if (q->cur == q->end) { + return 1; + } + + assert(q->cur + 1 < q->end); /* require at least two items */ + assert(q_cur_type(q) == MQE_START); + + const struct Castle *c = getImplNfa(n); + u8 *active = (u8 *)q->streamState + c->activeOffset; + + u64a end_offset = q_last_loc(q) + q->offset; + s64a last_kill_loc = castleLastKillLoc(c, q); + DEBUG_PRINTF("all repeats killed at %lld (exec range %lld, %lld)\n", + last_kill_loc, q_cur_loc(q), q_last_loc(q)); + assert(last_kill_loc < q_last_loc(q)); + + if (last_kill_loc != q_cur_loc(q) - 1) { + clear_repeats(c, q, active); + } + + q->cur++; /* skip start event */ + + /* skip events prior to the repeats being squashed */ + while (q_cur_loc(q) <= last_kill_loc) { + DEBUG_PRINTF("skipping moot event at %lld\n", q_cur_loc(q)); + q->cur++; + assert(q->cur < q->end); + } + + while (q->cur < q->end) { + DEBUG_PRINTF("q item type=%d offset=%llu\n", q_cur_type(q), + q_cur_offset(q)); + u64a sp = q_cur_offset(q); + castleHandleEvent(c, q, sp, 0); + q->cur++; + } + + castleDeactivateStaleSubs(c, end_offset, q->state, q->streamState); + + char found = 0; + if (c->exclusive) { + u8 *groups = (u8 *)q->streamState + c->groupIterOffset; + found = mmbit_any_precise(groups, c->numGroups); + + } + + if (!found && !mmbit_any_precise(active, c->numRepeats)) { + DEBUG_PRINTF("castle is dead\n"); + return 0; + } + + if (castleInAccept(c, q, report, end_offset)) { + return MO_MATCHES_PENDING; + } + + return 1; +} + +char nfaExecCastle_reportCurrent(const struct NFA *n, struct mq *q) { + assert(n && q); + assert(n->type == CASTLE_NFA); + DEBUG_PRINTF("entry\n"); + + const struct Castle *c = getImplNfa(n); + castleReportCurrent(c, q); + return 0; +} + +char nfaExecCastle_inAccept(const struct NFA *n, ReportID report, + struct mq *q) { + assert(n && q); + assert(n->type == CASTLE_NFA); + DEBUG_PRINTF("entry\n"); + + const struct Castle *c = getImplNfa(n); + return castleInAccept(c, q, report, q_cur_offset(q)); +} + +char 
nfaExecCastle_inAnyAccept(const struct NFA *n, struct mq *q) { + assert(n && q); + assert(n->type == CASTLE_NFA); + DEBUG_PRINTF("entry\n"); + + const struct Castle *c = getImplNfa(n); + const u64a offset = q_cur_offset(q); + DEBUG_PRINTF("offset=%llu\n", offset); + + if (c->exclusive) { + u8 *active = (u8 *)q->streamState; + u8 *groups = active + c->groupIterOffset; + for (u32 i = mmbit_iterate(groups, c->numGroups, MMB_INVALID); + i != MMB_INVALID; i = mmbit_iterate(groups, c->numGroups, i)) { + u8 *cur = active + i * c->activeIdxSize; + const u32 activeIdx = partial_load_u32(cur, c->activeIdxSize); + DEBUG_PRINTF("subcastle %u\n", activeIdx); + const struct SubCastle *sub = getSubCastle(c, activeIdx); + if (subCastleInAccept(c, q, sub->report, offset, activeIdx)) { + return 1; + } + } + } + + if (c->exclusive != PURE_EXCLUSIVE) { + const u8 *active = (const u8 *)q->streamState + c->activeOffset; + for (u32 i = mmbit_iterate(active, c->numRepeats, MMB_INVALID); + i != MMB_INVALID; i = mmbit_iterate(active, c->numRepeats, i)) { + DEBUG_PRINTF("subcastle %u\n", i); + const struct SubCastle *sub = getSubCastle(c, i); + if (subCastleInAccept(c, q, sub->report, offset, i)) { + return 1; + } + } + } + + return 0; +} + + +char nfaExecCastle_queueInitState(UNUSED const struct NFA *n, struct mq *q) { + assert(n && q); + assert(n->type == CASTLE_NFA); + DEBUG_PRINTF("entry\n"); + + const struct Castle *c = getImplNfa(n); + assert(q->streamState); + if (c->exclusive) { + u8 *groups = (u8 *)q->streamState + c->groupIterOffset; + mmbit_clear(groups, c->numGroups); + } + + if (c->exclusive != PURE_EXCLUSIVE) { + u8 *active = (u8 *)q->streamState + c->activeOffset; + mmbit_clear(active, c->numRepeats); + } + return 0; +} + +char nfaExecCastle_initCompressedState(const struct NFA *n, UNUSED u64a offset, + void *state, UNUSED u8 key) { + assert(n && state); + assert(n->type == CASTLE_NFA); + DEBUG_PRINTF("entry\n"); + + const struct Castle *c = getImplNfa(n); + if (c->exclusive) { + u8 *groups = (u8 *)state + c->groupIterOffset; + mmbit_clear(groups, c->numGroups); + } + + if (c->exclusive != PURE_EXCLUSIVE) { + u8 *active = (u8 *)state + c->activeOffset; + mmbit_clear(active, c->numRepeats); + } + return 0; +} + +static really_inline +void subCastleQueueCompressState(const struct Castle *c, const u32 subIdx, + const struct mq *q, const u64a offset) { + const struct SubCastle *sub = getSubCastle(c, subIdx); + const struct RepeatInfo *info = getRepeatInfo(sub); + union RepeatControl *rctrl = getControl(q->state, sub); + char *packed = (char *)q->streamState + sub->streamStateOffset; + DEBUG_PRINTF("sub %u next match %llu\n", subIdx, + repeatNextMatch(info, rctrl, + packed + info->packedCtrlSize, offset)); + repeatPack(packed, info, rctrl, offset); +} + +char nfaExecCastle_queueCompressState(const struct NFA *n, const struct mq *q, + s64a loc) { + assert(n && q); + assert(n->type == CASTLE_NFA); + DEBUG_PRINTF("entry, loc=%lld\n", loc); + + const struct Castle *c = getImplNfa(n); + + // Pack state for all active repeats. 
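+    // Both the exclusive-group slots and the non-exclusive multibit are
+    // walked below; repeatPack() writes the compressed control block for
+    // every live subcastle into stream state.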
+ const u64a offset = q->offset + loc; + DEBUG_PRINTF("offset=%llu\n", offset); + if (c->exclusive) { + u8 *active = (u8 *)q->streamState; + u8 *groups = active + c->groupIterOffset; + for (u32 i = mmbit_iterate(groups, c->numGroups, MMB_INVALID); + i != MMB_INVALID; i = mmbit_iterate(groups, c->numGroups, i)) { + u8 *cur = active + i * c->activeIdxSize; + const u32 activeIdx = partial_load_u32(cur, c->activeIdxSize); + DEBUG_PRINTF("packing state for sub %u\n", activeIdx); + subCastleQueueCompressState(c, activeIdx, q, offset); + } + } + + if (c->exclusive != PURE_EXCLUSIVE) { + const u8 *active = (const u8 *)q->streamState + c->activeOffset; + for (u32 i = mmbit_iterate(active, c->numRepeats, MMB_INVALID); + i != MMB_INVALID; i = mmbit_iterate(active, c->numRepeats, i)) { + DEBUG_PRINTF("packing state for sub %u\n", i); + subCastleQueueCompressState(c, i, q, offset); + } + } + return 0; +} + +static really_inline +void subCastleExpandState(const struct Castle *c, const u32 subIdx, + void *dest, const void *src, const u64a offset) { + const struct SubCastle *sub = getSubCastle(c, subIdx); + const struct RepeatInfo *info = getRepeatInfo(sub); + DEBUG_PRINTF("unpacking state for sub %u\n", subIdx); + union RepeatControl *rctrl = getControl(dest, sub); + const char *packed = (const char *)src + sub->streamStateOffset; + repeatUnpack(packed, info, offset, rctrl); + DEBUG_PRINTF("sub %u next match %llu\n", subIdx, + repeatNextMatch(info, rctrl, + packed + info->packedCtrlSize, offset)); +} + +char nfaExecCastle_expandState(const struct NFA *n, void *dest, const void *src, + u64a offset, UNUSED u8 key) { + assert(n && dest && src); + assert(n->type == CASTLE_NFA); + DEBUG_PRINTF("entry, src=%p, dest=%p, offset=%llu\n", src, dest, offset); + + const struct Castle *c = getImplNfa(n); + + if (c->exclusive) { + const u8 *active = (const u8 *)src; + const u8 *groups = active + c->groupIterOffset; + for (u32 i = mmbit_iterate(groups, c->numGroups, MMB_INVALID); + i != MMB_INVALID; i = mmbit_iterate(groups, c->numGroups, i)) { + const u8 *cur = active + i * c->activeIdxSize; + const u32 activeIdx = partial_load_u32(cur, c->activeIdxSize); + subCastleExpandState(c, activeIdx, dest, src, offset); + } + } + + if (c->exclusive != PURE_EXCLUSIVE) { + // Unpack state for all active repeats. + const u8 *active = (const u8 *)src + c->activeOffset; + for (u32 i = mmbit_iterate(active, c->numRepeats, MMB_INVALID); + i != MMB_INVALID; i = mmbit_iterate(active, c->numRepeats, i)) { + subCastleExpandState(c, i, dest, src, offset); + } + } + return 0; +} diff --git a/regex/nfa/castle.h b/regex/nfa/castle.h new file mode 100644 index 000000000..cc7496ca7 --- /dev/null +++ b/regex/nfa/castle.h @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2015-2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef NFA_CASTLE_H +#define NFA_CASTLE_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include "ue2common.h" + +struct mq; +struct NFA; + +char nfaExecCastle_Q(const struct NFA *n, struct mq *q, s64a end); +char nfaExecCastle_Q2(const struct NFA *n, struct mq *q, s64a end); +char nfaExecCastle_QR(const struct NFA *n, struct mq *q, ReportID report); +char nfaExecCastle_reportCurrent(const struct NFA *n, struct mq *q); +char nfaExecCastle_inAccept(const struct NFA *n, ReportID report, + struct mq *q); +char nfaExecCastle_inAnyAccept(const struct NFA *n, struct mq *q); +char nfaExecCastle_queueInitState(const struct NFA *n, struct mq *q); +char nfaExecCastle_initCompressedState(const struct NFA *n, u64a offset, + void *state, u8 key); +char nfaExecCastle_queueCompressState(const struct NFA *nfa, const struct mq *q, + s64a loc); +char nfaExecCastle_expandState(const struct NFA *nfa, void *dest, + const void *src, u64a offset, u8 key); + +#define nfaExecCastle_testEOD NFA_API_NO_IMPL +#define nfaExecCastle_B_Reverse NFA_API_NO_IMPL +#define nfaExecCastle_zombie_status NFA_API_ZOMBIE_NO_IMPL + +#ifdef __cplusplus +} + +#endif // __cplusplus + +#endif diff --git a/regex/nfa/castle_internal.h b/regex/nfa/castle_internal.h new file mode 100644 index 000000000..429c232ff --- /dev/null +++ b/regex/nfa/castle_internal.h @@ -0,0 +1,143 @@ +/* + * Copyright (c) 2015-2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Castle: multi-tenant repeat engine, data structures. + */ + +#ifndef NFA_CASTLE_INTERNAL_H +#define NFA_CASTLE_INTERNAL_H + +#include "ue2common.h" +#include "repeat_internal.h" + +struct SubCastle { + ReportID report; //!< report to raise on match + u32 fullStateOffset; //!< offset within full state (scratch) + u32 streamStateOffset; //!< offset within stream state + u32 repeatInfoOffset; //!< offset of RepeatInfo structure + // relative to the start of SubCastle + u32 exclusiveId; //!< exclusive group id of this SubCastle, + // set to the number of SubCastles in Castle + // if it is not exclusive +}; + +#define CASTLE_DOT 0 +#define CASTLE_VERM 1 +#define CASTLE_NVERM 2 +#define CASTLE_SHUFTI 3 +#define CASTLE_TRUFFLE 4 + +enum ExclusiveType { + NOT_EXCLUSIVE, //!< no subcastles are exclusive + EXCLUSIVE, //!< a subset of subcastles are exclusive + PURE_EXCLUSIVE //!< all subcastles are exclusive +}; + +/** + * \brief Castle engine structure. + * + * A Castle is a collection of repeats that all share the same character + * reachability. + * + * The whole engine is laid out in memory as: + * + * - struct NFA + * - struct Castle + * - struct SubCastle[numRepeats] + * - tables for sparse model repeats + * - sparse iterator for subcastles that may be stale + * + * Castle stores an "active repeats" multibit in stream state, followed by the + * packed repeat state for each SubCastle. If there are both exclusive and + * non-exclusive SubCastle groups, we use an active id for each exclusive group + * and a multibit for the non-exclusive group. We also store an "active + * exclusive groups" multibit for exclusive groups. If all SubCastles are mutual + * exclusive, we remove "active repeats" multibit from stream state. + * * Castle stream state: + * * + * * |---| + * * | | active subengine id for exclusive group 1 + * * |---| + * * | | active subengine id for exclusive group 2(if necessary) + * * |---| + * * ... + * * |---| + * * | | "active repeats" multibit for non-exclusive subcastles + * * | | (if not all subcastles are exclusive) + * * |---| + * * | | active multibit for exclusive groups + * * | | + * * |---| + * * ||-|| common pool of stream state for exclusive group 1 + * * ||-|| + * * |---| + * * ||-|| common pool of stream state for exclusive group 2(if necessary) + * * ||-|| + * * |---| + * * ... + * * |---| + * * | | stream state for each non-exclusive subcastles + * * ... + * * | | + * * |---| + * + * In full state (stored in scratch space) it stores a temporary multibit over + * the repeats (used by \ref castleMatchLoop), followed by the repeat control + * blocks for each SubCastle. 
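+ *
+ * The scanning mechanism selected by the 'type' field below corresponds to
+ * the CASTLE_DOT/VERM/NVERM/SHUFTI/TRUFFLE constants defined above and is
+ * dispatched at runtime by castleScan()/castleRevScan() in castle.c.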
+ */ +struct ALIGN_AVX_DIRECTIVE Castle { + u32 numRepeats; //!< number of repeats in Castle + u32 numGroups; //!< number of exclusive groups + u8 type; //!< tells us which scanning mechanism (below) to use + u8 exclusive; //!< tells us if there are mutual exclusive SubCastles + u8 activeIdxSize; //!< number of bytes in stream state to store + // active SubCastle id for exclusive mode + u32 activeOffset; //!< offset to active multibit for non-exclusive + // SubCastles + u32 staleIterOffset; //!< offset to a sparse iterator to check for stale + // sub castles + u32 groupIterOffset; //!< offset to a iterator to check the aliveness of + // exclusive groups + + union { + struct { + char c; + } verm; + struct { + m128 mask_lo; + m128 mask_hi; + } shuf; + struct { + m128 mask1; + m128 mask2; + } truffle; + } u; +}; + +#endif // NFA_CASTLE_INTERNAL_H diff --git a/regex/nfa/gough.c b/regex/nfa/gough.c new file mode 100644 index 000000000..eebd54345 --- /dev/null +++ b/regex/nfa/gough.c @@ -0,0 +1,1147 @@ +/* + * Copyright (c) 2015-2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "gough.h" + +#include "accel.h" +#include "gough_internal.h" +#include "mcclellan.h" +#include "nfa_api.h" +#include "nfa_api_queue.h" +#include "nfa_internal.h" +#include "util/bitutils.h" +#include "util/compare.h" +#include "util/simd_utils.h" +#include "util/unaligned.h" +#include "ue2common.h" +#ifndef __KERNEL__ +#include +#else +#include +#endif + +#include "mcclellan_common_impl.h" + +#define GOUGH_SOM_EARLY (~0ULL) + +static really_inline +void compressSomValue(u32 comp_slot_width, u64a curr_offset, + void *dest_som_base, u32 i, u64a val) { + void *dest_som = (u8 *)dest_som_base + i * comp_slot_width; + /* gough does not initialise all slots, so may contain garbage */ + u64a delta = curr_offset - val; + switch (comp_slot_width) { + case 2: + if (delta >= (u16)~0U) { + delta = GOUGH_SOM_EARLY; + } + unaligned_store_u16(dest_som, delta); + break; + case 4: + if (delta >= (u32)~0U) { + delta = GOUGH_SOM_EARLY; + } + unaligned_store_u32(dest_som, delta); + break; + case 8: + if (delta >= ~0ULL) { + delta = GOUGH_SOM_EARLY; + } + unaligned_store_u64a(dest_som, delta); + break; + default: + assert(0); + } +} + +static really_inline +u64a expandSomValue(u32 comp_slot_width, u64a curr_offset, + const void *src_som_base, u32 i) { + /* Note: gough does not initialise all slots, so we may end up decompressing + * garbage */ + + const void *src_som = (const u8 *)src_som_base + i * comp_slot_width; + u64a val = 0; + switch (comp_slot_width) { + case 2: + val = unaligned_load_u16(src_som); + if (val == (u16)~0U) { + return GOUGH_SOM_EARLY; + } + break; + case 4: + val = unaligned_load_u32(src_som); + if (val == (u32)~0U) { + return GOUGH_SOM_EARLY; + } + break; + case 8: + val = unaligned_load_u64a(src_som); + if (val == ~0ULL) { + return GOUGH_SOM_EARLY; + } + break; + + default: + assert(0); + } + return curr_offset - val; +} + +static really_inline +char doReports(NfaCallback cb, void *ctxt, const struct mcclellan *m, + const struct gough_som_info *som, u16 s, u64a loc, + char eod, u16 * const cached_accept_state, + u32 * const cached_accept_id, u32 * const cached_accept_som) { + DEBUG_PRINTF("reporting state = %hu, loc=%llu, eod %hhu\n", + (u16)(s & STATE_MASK), loc, eod); + + if (!eod && s == *cached_accept_state) { + u64a from = *cached_accept_som == INVALID_SLOT ? loc + : som->slots[*cached_accept_som]; + if (cb(from, loc, *cached_accept_id, ctxt) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; /* termination requested */ + } + + return MO_CONTINUE_MATCHING; /* continue execution */ + } + + const struct mstate_aux *aux = get_aux(m, s); + size_t offset = eod ? aux->accept_eod : aux->accept; + + assert(offset); + const struct gough_report_list *rl + = (const void *)((const char *)m + offset - sizeof(struct NFA)); + assert(ISALIGNED(rl)); + + DEBUG_PRINTF("report list size %u\n", rl->count); + u32 count = rl->count; + + if (!eod && count == 1) { + *cached_accept_state = s; + *cached_accept_id = rl->report[0].r; + *cached_accept_som = rl->report[0].som; + + u64a from = *cached_accept_som == INVALID_SLOT ? loc + : som->slots[*cached_accept_som]; + DEBUG_PRINTF("reporting %u, using som[%u]=%llu\n", rl->report[0].r, + *cached_accept_som, from); + if (cb(from, loc, *cached_accept_id, ctxt) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; /* termination requested */ + } + + return MO_CONTINUE_MATCHING; /* continue execution */ + } + + for (u32 i = 0; i < count; i++) { + u32 slot = rl->report[i].som; + u64a from = slot == INVALID_SLOT ? 
loc : som->slots[slot]; + DEBUG_PRINTF("reporting %u, using som[%u] = %llu\n", + rl->report[i].r, slot, from); + if (cb(from, loc, rl->report[i].r, ctxt) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; /* termination requested */ + } + } + + return MO_CONTINUE_MATCHING; /* continue execution */ +} + +#ifdef DUMP_SUPPORT +static UNUSED +const char *dump_op(u8 op) { + switch (op) { + case GOUGH_INS_END: + return "END"; + case GOUGH_INS_MOV: + return "MOV"; + case GOUGH_INS_NEW: + return "NEW"; + case GOUGH_INS_MIN: + return "MIN"; + default: + return "???"; + } +} +#endif + +static really_inline +void run_prog_i(UNUSED const struct NFA *nfa, + const struct gough_ins *pc, u64a som_offset, + struct gough_som_info *som) { + DEBUG_PRINTF("run prog at som_offset of %llu\n", som_offset); + while (1) { + assert((const u8 *)pc >= (const u8 *)nfa); + assert((const u8 *)pc < (const u8 *)nfa + nfa->length); + u32 dest = pc->dest; + u32 src = pc->src; + assert(pc->op == GOUGH_INS_END + || dest < (nfa->scratchStateSize - 16) / 8); + DEBUG_PRINTF("%s %u %u\n", dump_op(pc->op), dest, src); + switch (pc->op) { + case GOUGH_INS_END: + return; + case GOUGH_INS_MOV: + som->slots[dest] = som->slots[src]; + break; + case GOUGH_INS_NEW: + /* note: c has already been advanced */ + DEBUG_PRINTF("current offset %llu; adjust %u\n", som_offset, + pc->src); + assert(som_offset >= pc->src); + som->slots[dest] = som_offset - pc->src; + break; + case GOUGH_INS_MIN: + /* TODO: shift all values along by one so that a normal min works + */ + if (som->slots[src] == GOUGH_SOM_EARLY) { + som->slots[dest] = som->slots[src]; + } else if (som->slots[dest] != GOUGH_SOM_EARLY) { + LIMIT_TO_AT_MOST(&som->slots[dest], som->slots[src]); + } + break; + default: + assert(0); + return; + } + DEBUG_PRINTF("dest slot[%u] = %llu\n", dest, som->slots[dest]); + ++pc; + } +} + +static really_inline +void run_prog(const struct NFA *nfa, const u32 *edge_prog_table, + const u8 *buf, u64a offAdj, const u8 *c, u32 edge_num, + struct gough_som_info *som) { + DEBUG_PRINTF("taking edge %u\n", edge_num); + u32 prog_offset = edge_prog_table[edge_num]; + if (!prog_offset) { + DEBUG_PRINTF("no prog on edge\n"); + return; + } + + const struct gough_ins *pc = (const void *)((const u8 *)nfa + prog_offset); + u64a curr_offset = (u64a)(c - buf) + offAdj - 1; + run_prog_i(nfa, pc, curr_offset, som); +} + +static never_inline +void run_accel_prog(const struct NFA *nfa, const struct gough_accel *gacc, + const u8 *buf, u64a offAdj, const u8 *c, const u8 *c2, + struct gough_som_info *som) { + assert(gacc->prog_offset); + assert(c2 > c); + + const struct gough_ins *pc + = (const void *)((const u8 *)nfa + gacc->prog_offset); + s64a margin_dist = gacc->margin_dist; + + DEBUG_PRINTF("run accel after skip %lld margin; advanced %zd\n", + margin_dist, c2 - c); + + if (c2 - c <= 2 * margin_dist) { + while (c < c2) { + u64a curr_offset = (u64a)(c - buf) + offAdj; + run_prog_i(nfa, pc, curr_offset, som); + c++; + } + } else { + u64a curr_offset = (u64a)(c - buf) + offAdj; + for (s64a i = 0; i < margin_dist; i++) { + run_prog_i(nfa, pc, curr_offset + i, som); + } + + curr_offset = (u64a)(c2 - buf) + offAdj - margin_dist; + for (s64a i = 0; i < margin_dist; i++) { + run_prog_i(nfa, pc, curr_offset + i, som); + } + } +} + +static never_inline +u16 goughEnableStarts(const struct mcclellan *m, u16 s, u64a som_offset, + struct gough_som_info *som) { + DEBUG_PRINTF("top triggered while at %hu\n", s); + const struct mstate_aux *aux = get_aux(m, s); + DEBUG_PRINTF("now going to 
state %hu\n", aux->top); + + const u32 *top_offsets = get_gough_top_offsets(m); + if (!top_offsets) { + return aux->top; + } + + u32 prog_offset = top_offsets[s]; + if (!prog_offset) { + return aux->top; + } + + DEBUG_PRINTF("doing som for top\n"); + const struct NFA *nfa + = (const struct NFA *)((const char *)m - sizeof(struct NFA)); + const struct gough_ins *pc = (const void *)((const u8 *)nfa + + prog_offset); + run_prog_i(nfa, pc, som_offset, som); + return aux->top; +} + +static really_inline +char goughExec16_i(const struct mcclellan *m, struct gough_som_info *som, + u16 *state, const u8 *buf, size_t len, u64a offAdj, + NfaCallback cb, void *ctxt, const u8 **c_final, + enum MatchMode mode) { + assert(ISALIGNED_N(state, 2)); + + u16 s = *state; + const struct NFA *nfa + = (const struct NFA *)((const char *)m - sizeof(struct NFA)); + const u8 *c = buf, *c_end = buf + len; + const u16 *succ_table = (const u16 *)((const char *)m + + sizeof(struct mcclellan)); + assert(ISALIGNED_N(succ_table, 2)); + const u16 sherman_base = m->sherman_limit; + const char *sherman_base_offset + = (const char *)nfa + m->sherman_offset; + const u32 as = m->alphaShift; + + s &= STATE_MASK; + + u32 cached_accept_id = 0; + u16 cached_accept_state = 0; + u32 cached_accept_som = 0; + + const u32 *edge_prog_table = (const u32 *)(get_gough(m) + 1); + + DEBUG_PRINTF("s: %hu, len %zu\n", s, len); + + const u8 *min_accel_offset = c; + if (!m->has_accel || len < ACCEL_MIN_LEN) { + min_accel_offset = c_end; + goto without_accel; + } + + goto with_accel; + +without_accel: + while (c < min_accel_offset && s) { + u8 cprime = m->remap[*(c++)]; + DEBUG_PRINTF("c: %02hhx cp:%02hhx (s=%hu)\n", *(c-1), cprime, s); + + u32 edge_num = ((u32)s << as) + cprime; + run_prog(nfa, edge_prog_table, buf, offAdj, c, edge_num, som); + if (s < sherman_base) { + DEBUG_PRINTF("doing normal\n"); + assert(s < m->state_count); + s = succ_table[edge_num]; + } else { + const char *sherman_state + = findShermanState(m, sherman_base_offset, sherman_base, s); + DEBUG_PRINTF("doing sherman\n"); + s = doSherman16(sherman_state, cprime, succ_table, as); + } + DEBUG_PRINTF("s: %hu (%hu)\n", s, (u16)(s & STATE_MASK)); + + if (mode != NO_MATCHES && (s & ACCEPT_FLAG)) { + if (mode == STOP_AT_MATCH) { + *state = s & STATE_MASK; + *c_final = c - 1; + return MO_CONTINUE_MATCHING; + } + + u64a loc = (c - 1) - buf + offAdj + 1; + if (doReports(cb, ctxt, m, som, s & STATE_MASK, loc, 0, + &cached_accept_state, &cached_accept_id, + &cached_accept_som) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } + + s &= STATE_MASK; + } + +with_accel: + while (c < c_end && s) { + u8 cprime = m->remap[*(c++)]; + DEBUG_PRINTF("c: %02hhx cp:%02hhx (s=%hu)\n", *(c-1), cprime, s); + + u32 edge_num = ((u32)s << as) + cprime; + run_prog(nfa, edge_prog_table, buf, offAdj, c, edge_num, som); + if (s < sherman_base) { + DEBUG_PRINTF("doing normal\n"); + assert(s < m->state_count); + s = succ_table[edge_num]; + } else { + const char *sherman_state + = findShermanState(m, sherman_base_offset, sherman_base, s); + DEBUG_PRINTF("doing sherman\n"); + s = doSherman16(sherman_state, cprime, succ_table, as); + } + DEBUG_PRINTF("s: %hu (%hu)\n", s, (u16)(s & STATE_MASK)); + + if (mode != NO_MATCHES && (s & ACCEPT_FLAG)) { + if (mode == STOP_AT_MATCH) { + *state = s & STATE_MASK; + *c_final = c - 1; + return MO_CONTINUE_MATCHING; + } + + u64a loc = (c - 1) - buf + offAdj + 1; + + if (doReports(cb, ctxt, m, som, s & STATE_MASK, loc, 0, + &cached_accept_state, &cached_accept_id, + 
&cached_accept_som) + == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } else if (s & ACCEL_FLAG) { + DEBUG_PRINTF("skipping\n"); + const struct mstate_aux *this_aux = get_aux(m, s & STATE_MASK); + u32 accel_offset = this_aux->accel_offset; + + assert(accel_offset >= m->aux_offset); + assert(accel_offset < m->sherman_offset); + + const struct gough_accel *gacc + = (const void *)((const char *)m + accel_offset); + assert(!gacc->prog_offset == !gacc->margin_dist); + const u8 *c2 = run_accel(&gacc->accel, c, c_end); + + if (c2 != c && gacc->prog_offset) { + run_accel_prog(nfa, gacc, buf, offAdj, c, c2, som); + } + + if (c2 < min_accel_offset + BAD_ACCEL_DIST) { + min_accel_offset = c2 + BIG_ACCEL_PENALTY; + } else { + min_accel_offset = c2 + SMALL_ACCEL_PENALTY; + } + + if (min_accel_offset >= c_end - ACCEL_MIN_LEN) { + min_accel_offset = c_end; + } + + DEBUG_PRINTF("advanced %zd, next accel chance in %zd/%zd\n", + c2 - c, min_accel_offset - c2, c_end - c2); + + c = c2; + s &= STATE_MASK; + goto without_accel; + } + + s &= STATE_MASK; + } + + if (mode == STOP_AT_MATCH) { + *c_final = c_end; + } + *state = s; + + return MO_CONTINUE_MATCHING; +} + +static really_inline +char goughExec8_i(const struct mcclellan *m, struct gough_som_info *som, + u8 *state, const u8 *buf, size_t len, u64a offAdj, + NfaCallback cb, void *ctxt, const u8 **c_final, + enum MatchMode mode) { + u8 s = *state; + const u8 *c = buf, *c_end = buf + len; + const u8 *succ_table = (const u8 *)((const char *)m + + sizeof(struct mcclellan)); + const u32 as = m->alphaShift; + const struct mstate_aux *aux; + + const struct NFA *nfa + = (const struct NFA *)((const char *)m - sizeof(struct NFA)); + aux = (const struct mstate_aux *)((const char *)nfa + m->aux_offset); + + const u32 *edge_prog_table = (const u32 *)(get_gough(m) + 1); + + u16 accel_limit = m->accel_limit_8; + u16 accept_limit = m->accept_limit_8; + + u32 cached_accept_id = 0; + u16 cached_accept_state = 0; + u32 cached_accept_som = 0; + + DEBUG_PRINTF("accel %hu, accept %hu\n", accel_limit, accept_limit); + + DEBUG_PRINTF("s: %hhu, len %zu\n", s, len); + + const u8 *min_accel_offset = c; + if (!m->has_accel || len < ACCEL_MIN_LEN) { + min_accel_offset = c_end; + goto without_accel; + } + + goto with_accel; + +without_accel: + while (c < min_accel_offset && s) { + u8 cprime = m->remap[*(c++)]; + DEBUG_PRINTF("c: %02hhx '%c' cp:%02hhx\n", *(c-1), + ourisprint(*(c-1)) ? *(c-1) : '?', cprime); + + u32 edge_num = ((u32)s << as) + cprime; + + run_prog(nfa, edge_prog_table, buf, offAdj, c, edge_num, som); + + s = succ_table[edge_num]; + DEBUG_PRINTF("s: %hhu\n", s); + + if (mode != NO_MATCHES && s >= accept_limit) { + if (mode == STOP_AT_MATCH) { + DEBUG_PRINTF("match - pausing\n"); + *state = s; + *c_final = c - 1; + return MO_CONTINUE_MATCHING; + } + + u64a loc = (c - 1) - buf + offAdj + 1; + if (doReports(cb, ctxt, m, som, s, loc, 0, + &cached_accept_state, &cached_accept_id, + &cached_accept_som) + == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } + } + +with_accel: + while (c < c_end && s) { + u8 cprime = m->remap[*(c++)]; + DEBUG_PRINTF("c: %02hhx '%c' cp:%02hhx\n", *(c-1), + ourisprint(*(c-1)) ? 
*(c-1) : '?', cprime); + + u32 edge_num = ((u32)s << as) + cprime; + + run_prog(nfa, edge_prog_table, buf, offAdj, c, edge_num, som); + + s = succ_table[edge_num]; + DEBUG_PRINTF("s: %hhu\n", s); + + if (s >= accel_limit) { /* accept_limit >= accel_limit */ + if (mode != NO_MATCHES && s >= accept_limit) { + if (mode == STOP_AT_MATCH) { + DEBUG_PRINTF("match - pausing\n"); + *state = s; + *c_final = c - 1; + return MO_CONTINUE_MATCHING; + } + + u64a loc = (c - 1) - buf + offAdj + 1; + if (doReports(cb, ctxt, m, som, s, loc, 0, + &cached_accept_state, &cached_accept_id, + &cached_accept_som) + == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } else if (aux[s].accel_offset) { + DEBUG_PRINTF("skipping\n"); + + const struct gough_accel *gacc + = (const void *)((const char *)m + aux[s].accel_offset); + const u8 *c2 = run_accel(&gacc->accel, c, c_end); + + if (c2 != c && gacc->prog_offset) { + run_accel_prog(nfa, gacc, buf, offAdj, c, c2, som); + } + + if (c2 < min_accel_offset + BAD_ACCEL_DIST) { + min_accel_offset = c2 + BIG_ACCEL_PENALTY; + } else { + min_accel_offset = c2 + SMALL_ACCEL_PENALTY; + } + + if (min_accel_offset >= c_end - ACCEL_MIN_LEN) { + min_accel_offset = c_end; + } + + DEBUG_PRINTF("advanced %zd, next accel chance in %zd/%zd\n", + c2 - c, min_accel_offset - c2, c_end - c2); + + c = c2; + goto without_accel; + } + } + } + + *state = s; + if (mode == STOP_AT_MATCH) { + *c_final = c_end; + } + return MO_CONTINUE_MATCHING; +} + +static never_inline +char goughExec8_i_ni(const struct mcclellan *m, struct gough_som_info *som, + u8 *state, const u8 *buf, size_t len, u64a offAdj, + NfaCallback cb, void *ctxt, const u8 **final_point, + enum MatchMode mode) { + return goughExec8_i(m, som, state, buf, len, offAdj, cb, ctxt, final_point, + mode); +} + +static never_inline +char goughExec16_i_ni(const struct mcclellan *m, struct gough_som_info *som, + u16 *state, const u8 *buf, size_t len, u64a offAdj, + NfaCallback cb, void *ctxt, const u8 **final_point, + enum MatchMode mode) { + return goughExec16_i(m, som, state, buf, len, offAdj, cb, ctxt, final_point, + mode); +} + +static really_inline +struct gough_som_info *getSomInfo(char *state_base) { + return (struct gough_som_info *)(state_base + 16); +} + +static really_inline +const struct gough_som_info *getSomInfoConst(const char *state_base) { + return (const struct gough_som_info *)(state_base + 16); +} + +static really_inline +char nfaExecGough8_Q2i(const struct NFA *n, u64a offset, const u8 *buffer, + const u8 *hend, NfaCallback cb, void *context, + struct mq *q, s64a end, enum MatchMode mode) { + DEBUG_PRINTF("enter\n"); + struct gough_som_info *som = getSomInfo(q->state); + assert(n->type == GOUGH_NFA_8); + const struct mcclellan *m = (const struct mcclellan *)getImplNfa(n); + s64a sp; + u8 s = *(u8 *)q->state; + + if (q->report_current) { + assert(s); + assert(s >= m->accept_limit_8); + + u32 cached_accept_id = 0; + u16 cached_accept_state = 0; + u32 cached_accept_som = 0; + + int rv = doReports(cb, context, m, som, s, q_cur_offset(q), 0, + &cached_accept_state, &cached_accept_id, + &cached_accept_som); + + q->report_current = 0; + + if (rv == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } + + sp = q_cur_loc(q); + q->cur++; + + const u8 *cur_buf = sp < 0 ? 
hend : buffer; + + if (mode != NO_MATCHES && q->items[q->cur - 1].location > end) { + /* this is as far as we go */ + q->cur--; + q->items[q->cur].type = MQE_START; + q->items[q->cur].location = end; + *(u8 *)q->state = s; + return MO_ALIVE; + } + + while (1) { + DEBUG_PRINTF("%s @ %llu [som %llu]\n", + q->items[q->cur].type == MQE_TOP ? "TOP" : + q->items[q->cur].type == MQE_END ? "END" : "???", + q->items[q->cur].location + offset, q->items[q->cur].som); + assert(q->cur < q->end); + s64a ep = q->items[q->cur].location; + if (mode != NO_MATCHES) { + ep = MIN(ep, end); + } + + assert(ep >= sp); + DEBUG_PRINTF("run to %lld from %lld\n", ep, sp); + + s64a local_ep = ep; + if (sp < 0) { + local_ep = MIN(0, ep); + } + + const u8 *final_look; + if (goughExec8_i_ni(m, som, &s, cur_buf + sp, local_ep - sp, + offset + sp, cb, context, &final_look, mode) + == MO_HALT_MATCHING) { + *(u8 *)q->state = 0; + return 0; + } + if (mode == STOP_AT_MATCH && final_look != cur_buf + local_ep) { + /* found a match */ + DEBUG_PRINTF("found a match\n"); + assert(q->cur); + q->cur--; + q->items[q->cur].type = MQE_START; + q->items[q->cur].location = final_look - cur_buf + 1; /* due to + * early -1 */ + *(u8 *)q->state = s; + return MO_MATCHES_PENDING; + } + + assert(q->cur); + if (mode != NO_MATCHES && q->items[q->cur].location > end) { + /* this is as far as we go */ + assert(q->cur); + q->cur--; + q->items[q->cur].type = MQE_START; + q->items[q->cur].location = end; + *(u8 *)q->state = s; + return MO_ALIVE; + } + + sp = local_ep; + + if (sp == 0) { + cur_buf = buffer; + } + + if (sp != ep) { + continue; + } + + switch (q->items[q->cur].type) { + case MQE_TOP: + assert(!s || sp + offset > 0); + if (sp + offset == 0) { + s = (u8)m->start_anchored; + break; + } + s = goughEnableStarts(m, s, q->items[q->cur].som, som); + break; + case MQE_END: + *(u8 *)q->state = s; + q->cur++; + return s ? MO_ALIVE : 0; + default: + assert(!"invalid queue event"); + } + + q->cur++; + } +} + + +static really_inline +char nfaExecGough16_Q2i(const struct NFA *n, u64a offset, const u8 *buffer, + const u8 *hend, NfaCallback cb, void *context, + struct mq *q, s64a end, enum MatchMode mode) { + struct gough_som_info *som = getSomInfo(q->state); + assert(n->type == GOUGH_NFA_16); + const struct mcclellan *m = (const struct mcclellan *)getImplNfa(n); + s64a sp; + + assert(ISALIGNED_N(q->state, 2)); + u16 s = *(u16 *)q->state; + + if (q->report_current) { + assert(s); + assert(get_aux(m, s)->accept); + + u32 cached_accept_id = 0; + u16 cached_accept_state = 0; + u32 cached_accept_som = 0; + + int rv = doReports(cb, context, m, som, s, q_cur_offset(q), 0, + &cached_accept_state, &cached_accept_id, + &cached_accept_som); + + q->report_current = 0; + + if (rv == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } + + sp = q_cur_loc(q); + q->cur++; + + const u8 *cur_buf = sp < 0 ? 
hend : buffer; + + assert(q->cur); + if (mode != NO_MATCHES && q->items[q->cur - 1].location > end) { + /* this is as far as we go */ + q->cur--; + q->items[q->cur].type = MQE_START; + q->items[q->cur].location = end; + *(u16 *)q->state = s; + return MO_ALIVE; + } + + while (1) { + assert(q->cur < q->end); + s64a ep = q->items[q->cur].location; + if (mode != NO_MATCHES) { + ep = MIN(ep, end); + } + + assert(ep >= sp); + + s64a local_ep = ep; + if (sp < 0) { + local_ep = MIN(0, ep); + } + + /* do main buffer region */ + const u8 *final_look; + if (goughExec16_i_ni(m, som, &s, cur_buf + sp, local_ep - sp, + offset + sp, cb, context, &final_look, mode) + == MO_HALT_MATCHING) { + *(u16 *)q->state = 0; + return 0; + } + if (mode == STOP_AT_MATCH && final_look != cur_buf + local_ep) { + /* this is as far as we go */ + assert(q->cur); + DEBUG_PRINTF("state %hu final_look %zd\n", s, + final_look - cur_buf); + q->cur--; + q->items[q->cur].type = MQE_START; + q->items[q->cur].location = final_look - cur_buf + 1; /* due to + * early -1 */ + *(u16 *)q->state = s; + return MO_MATCHES_PENDING; + } + + assert(q->cur); + if (mode != NO_MATCHES && q->items[q->cur].location > end) { + /* this is as far as we go */ + q->cur--; + q->items[q->cur].type = MQE_START; + q->items[q->cur].location = end; + *(u16 *)q->state = s; + return MO_ALIVE; + } + + sp = local_ep; + + if (sp == 0) { + cur_buf = buffer; + } + + if (sp != ep) { + continue; + } + + switch (q->items[q->cur].type) { + case MQE_TOP: + assert(!s || sp + offset > 0); + if (sp + offset == 0) { + s = m->start_anchored; + break; + } + s = goughEnableStarts(m, s, q->items[q->cur].som, som); + break; + case MQE_END: + *(u16 *)q->state = s; + q->cur++; + return s ? MO_ALIVE : 0; + default: + assert(!"invalid queue event"); + } + + q->cur++; + } +} + +char nfaExecGough8_Q(const struct NFA *n, struct mq *q, s64a end) { + u64a offset = q->offset; + const u8 *buffer = q->buffer; + NfaCallback cb = q->cb; + void *context = q->context; + assert(n->type == GOUGH_NFA_8); + const u8 *hend = q->history + q->hlength; + + return nfaExecGough8_Q2i(n, offset, buffer, hend, cb, context, q, end, + CALLBACK_OUTPUT); +} + +char nfaExecGough16_Q(const struct NFA *n, struct mq *q, s64a end) { + u64a offset = q->offset; + const u8 *buffer = q->buffer; + NfaCallback cb = q->cb; + void *context = q->context; + assert(n->type == GOUGH_NFA_16); + const u8 *hend = q->history + q->hlength; + + return nfaExecGough16_Q2i(n, offset, buffer, hend, cb, context, q, end, + CALLBACK_OUTPUT); +} + +char nfaExecGough8_Q2(const struct NFA *n, struct mq *q, s64a end) { + u64a offset = q->offset; + const u8 *buffer = q->buffer; + NfaCallback cb = q->cb; + void *context = q->context; + assert(n->type == GOUGH_NFA_8); + const u8 *hend = q->history + q->hlength; + + return nfaExecGough8_Q2i(n, offset, buffer, hend, cb, context, q, end, + STOP_AT_MATCH); +} + +char nfaExecGough16_Q2(const struct NFA *n, struct mq *q, s64a end) { + u64a offset = q->offset; + const u8 *buffer = q->buffer; + NfaCallback cb = q->cb; + void *context = q->context; + assert(n->type == GOUGH_NFA_16); + const u8 *hend = q->history + q->hlength; + + return nfaExecGough16_Q2i(n, offset, buffer, hend, cb, context, q, end, + STOP_AT_MATCH); +} + +char nfaExecGough8_QR(const struct NFA *n, struct mq *q, ReportID report) { + u64a offset = q->offset; + const u8 *buffer = q->buffer; + NfaCallback cb = q->cb; + void *context = q->context; + assert(n->type == GOUGH_NFA_8); + const u8 *hend = q->history + q->hlength; + + char rv = 
nfaExecGough8_Q2i(n, offset, buffer, hend, cb, context, q, + 0 /* end */, NO_MATCHES); + if (rv && nfaExecMcClellan8_inAccept(n, report, q)) { + return MO_MATCHES_PENDING; + } else { + return rv; + } +} + +char nfaExecGough16_QR(const struct NFA *n, struct mq *q, ReportID report) { + u64a offset = q->offset; + const u8 *buffer = q->buffer; + NfaCallback cb = q->cb; + void *context = q->context; + assert(n->type == GOUGH_NFA_16); + const u8 *hend = q->history + q->hlength; + + char rv = nfaExecGough16_Q2i(n, offset, buffer, hend, cb, context, q, + 0 /* end */, NO_MATCHES); + + if (rv && nfaExecMcClellan16_inAccept(n, report, q)) { + return MO_MATCHES_PENDING; + } else { + return rv; + } +} + +char nfaExecGough8_initCompressedState(const struct NFA *nfa, u64a offset, + void *state, UNUSED u8 key) { + const struct mcclellan *m = (const struct mcclellan *)getImplNfa(nfa); + memset(state, 0, nfa->streamStateSize); + u8 s = offset ? m->start_floating : m->start_anchored; + if (s) { + *(u8 *)state = s; + return 1; + } + return 0; +} + +char nfaExecGough16_initCompressedState(const struct NFA *nfa, u64a offset, + void *state, UNUSED u8 key) { + const struct mcclellan *m = (const struct mcclellan *)getImplNfa(nfa); + memset(state, 0, nfa->streamStateSize); + u16 s = offset ? m->start_floating : m->start_anchored; + if (s) { + unaligned_store_u16(state, s); + return 1; + } + return 0; +} + + +char nfaExecGough8_reportCurrent(const struct NFA *n, struct mq *q) { + const struct mcclellan *m = (const struct mcclellan *)getImplNfa(n); + NfaCallback cb = q->cb; + void *ctxt = q->context; + u8 s = *(u8 *)q->state; + u64a offset = q_cur_offset(q); + struct gough_som_info *som = getSomInfo(q->state); + assert(q_cur_type(q) == MQE_START); + assert(s); + + if (s >= m->accept_limit_8) { + u32 cached_accept_id = 0; + u16 cached_accept_state = 0; + u32 cached_accept_som = 0; + + doReports(cb, ctxt, m, som, s, offset, 0, &cached_accept_state, + &cached_accept_id, &cached_accept_som); + } + + return 0; +} + +char nfaExecGough16_reportCurrent(const struct NFA *n, struct mq *q) { + const struct mcclellan *m = (const struct mcclellan *)getImplNfa(n); + NfaCallback cb = q->cb; + void *ctxt = q->context; + u16 s = *(u16 *)q->state; + const struct mstate_aux *aux = get_aux(m, s); + u64a offset = q_cur_offset(q); + struct gough_som_info *som = getSomInfo(q->state); + assert(q_cur_type(q) == MQE_START); + DEBUG_PRINTF("state %hu\n", s); + assert(s); + + if (aux->accept) { + u32 cached_accept_id = 0; + u16 cached_accept_state = 0; + u32 cached_accept_som = 0; + + doReports(cb, ctxt, m, som, s, offset, 0, &cached_accept_state, + &cached_accept_id, &cached_accept_som); + } + + return 0; +} + +char nfaExecGough8_inAccept(const struct NFA *n, ReportID report, + struct mq *q) { + return nfaExecMcClellan8_inAccept(n, report, q); +} + +char nfaExecGough16_inAccept(const struct NFA *n, ReportID report, + struct mq *q) { + return nfaExecMcClellan16_inAccept(n, report, q); +} + +char nfaExecGough8_inAnyAccept(const struct NFA *n, struct mq *q) { + return nfaExecMcClellan8_inAnyAccept(n, q); +} + +char nfaExecGough16_inAnyAccept(const struct NFA *n, struct mq *q) { + return nfaExecMcClellan16_inAnyAccept(n, q); +} + +static +char goughCheckEOD(const struct NFA *nfa, u16 s, + const struct gough_som_info *som, + u64a offset, NfaCallback cb, void *ctxt) { + const struct mcclellan *m = (const struct mcclellan *)getImplNfa(nfa); + const struct mstate_aux *aux = get_aux(m, s); + + if (!aux->accept_eod) { + return MO_CONTINUE_MATCHING; + } 
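+    /* EOD reports cannot use the cached accept state (doReports only
+     * consults the cache when eod is false), so NULL cache pointers are
+     * passed here and the eod flag selects the aux->accept_eod report
+     * list rather than aux->accept. */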
+ return doReports(cb, ctxt, m, som, s, offset, 1, NULL, NULL, NULL); +} + +char nfaExecGough8_testEOD(const struct NFA *nfa, const char *state, + UNUSED const char *streamState, u64a offset, + NfaCallback callback, void *context) { + const struct gough_som_info *som = getSomInfoConst(state); + return goughCheckEOD(nfa, *(const u8 *)state, som, offset, callback, + context); +} + +char nfaExecGough16_testEOD(const struct NFA *nfa, const char *state, + UNUSED const char *streamState, u64a offset, + NfaCallback callback, void *context) { + assert(ISALIGNED_N(state, 8)); + const struct gough_som_info *som = getSomInfoConst(state); + return goughCheckEOD(nfa, *(const u16 *)state, som, offset, callback, + context); +} + +char nfaExecGough8_queueInitState(UNUSED const struct NFA *nfa, struct mq *q) { + memset(q->state, 0, nfa->scratchStateSize); + return 0; +} + +char nfaExecGough16_queueInitState(UNUSED const struct NFA *nfa, struct mq *q) { + memset(q->state, 0, nfa->scratchStateSize); + assert(ISALIGNED_N(q->state, 2)); + return 0; +} + +static really_inline +void compSomSpace(const struct NFA *nfa, u8 *dest_som_base, + const struct gough_som_info *src, u64a curr_offset) { + const struct mcclellan *m = (const struct mcclellan *)getImplNfa(nfa); + const struct gough_info *gi = get_gough(m); + u32 count = gi->stream_som_loc_count; + u32 width = gi->stream_som_loc_width; + + for (u32 i = 0; i < count; i++) { + compressSomValue(width, curr_offset, dest_som_base, i, src->slots[i]); + } +} + +static really_inline +void expandSomSpace(const struct NFA *nfa, struct gough_som_info *som, + const u8 *src_som_base, u64a curr_offset) { + const struct mcclellan *m = (const struct mcclellan *)getImplNfa(nfa); + const struct gough_info *gi = get_gough(m); + u32 count = gi->stream_som_loc_count; + u32 width = gi->stream_som_loc_width; + + for (u32 i = 0; i < count; i++) { + som->slots[i] = expandSomValue(width, curr_offset, src_som_base, i); + } +} + +char nfaExecGough8_queueCompressState(const struct NFA *nfa, const struct mq *q, + s64a loc) { + void *dest = q->streamState; + const void *src = q->state; + + *(u8 *)dest = *(const u8 *)src; + compSomSpace(nfa, (u8 *)dest + 1, getSomInfoConst(src), q->offset + loc); + return 0; +} + +char nfaExecGough8_expandState(const struct NFA *nfa, void *dest, + const void *src, u64a offset, UNUSED u8 key) { + *(u8 *)dest = *(const u8 *)src; + expandSomSpace(nfa, getSomInfo(dest), (const u8 *)src + 1, offset); + return 0; +} + +char nfaExecGough16_queueCompressState(const struct NFA *nfa, + const struct mq *q, s64a loc) { + void *dest = q->streamState; + const void *src = q->state; + + assert(ISALIGNED_N(src, 2)); + unaligned_store_u16(dest, *(const u16 *)(src)); + compSomSpace(nfa, (u8 *)dest + 2, getSomInfoConst(src), q->offset + loc); + return 0; +} + +char nfaExecGough16_expandState(const struct NFA *nfa, void *dest, + const void *src, u64a offset, UNUSED u8 key) { + assert(ISALIGNED_N(dest, 2)); + *(u16 *)dest = unaligned_load_u16(src); + expandSomSpace(nfa, getSomInfo(dest), (const u8 *)src + 2, offset); + return 0; +} diff --git a/regex/nfa/gough.h b/regex/nfa/gough.h new file mode 100644 index 000000000..a7f488923 --- /dev/null +++ b/regex/nfa/gough.h @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2015-2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of 
conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef GOUGH_H +#define GOUGH_H + +#include "callback.h" +#include "ue2common.h" + +struct NFA; +struct mq; + +// 8-bit Gough + +char nfaExecGough8_testEOD(const struct NFA *nfa, const char *state, + const char *streamState, u64a offset, + NfaCallback callback, void *context); +char nfaExecGough8_Q(const struct NFA *n, struct mq *q, s64a end); +char nfaExecGough8_Q2(const struct NFA *n, struct mq *q, s64a end); +char nfaExecGough8_QR(const struct NFA *n, struct mq *q, ReportID report); +char nfaExecGough8_reportCurrent(const struct NFA *n, struct mq *q); +char nfaExecGough8_inAccept(const struct NFA *n, ReportID report, struct mq *q); +char nfaExecGough8_inAnyAccept(const struct NFA *n, struct mq *q); +char nfaExecGough8_queueInitState(const struct NFA *n, struct mq *q); +char nfaExecGough8_initCompressedState(const struct NFA *n, u64a offset, + void *state, u8 key); +char nfaExecGough8_queueCompressState(const struct NFA *nfa, const struct mq *q, + s64a loc); +char nfaExecGough8_expandState(const struct NFA *nfa, void *dest, + const void *src, u64a offset, u8 key); + +#define nfaExecGough8_B_Reverse NFA_API_NO_IMPL +#define nfaExecGough8_zombie_status NFA_API_ZOMBIE_NO_IMPL + +// 16-bit Gough + +char nfaExecGough16_testEOD(const struct NFA *nfa, const char *state, + const char *streamState, u64a offset, + NfaCallback callback, void *context); +char nfaExecGough16_Q(const struct NFA *n, struct mq *q, s64a end); +char nfaExecGough16_Q2(const struct NFA *n, struct mq *q, s64a end); +char nfaExecGough16_QR(const struct NFA *n, struct mq *q, ReportID report); +char nfaExecGough16_reportCurrent(const struct NFA *n, struct mq *q); +char nfaExecGough16_inAccept(const struct NFA *n, ReportID report, struct mq *q); +char nfaExecGough16_inAnyAccept(const struct NFA *n, struct mq *q); +char nfaExecGough16_queueInitState(const struct NFA *n, struct mq *q); +char nfaExecGough16_initCompressedState(const struct NFA *n, u64a offset, + void *state, u8 key); +char nfaExecGough16_queueCompressState(const struct NFA *nfa, + const struct mq *q, s64a loc); +char nfaExecGough16_expandState(const struct NFA *nfa, void *dest, + const void *src, u64a offset, u8 key); + +#define nfaExecGough16_B_Reverse NFA_API_NO_IMPL +#define nfaExecGough16_zombie_status 
NFA_API_ZOMBIE_NO_IMPL + +#endif diff --git a/regex/nfa/gough_internal.h b/regex/nfa/gough_internal.h new file mode 100644 index 000000000..8bf06e0f7 --- /dev/null +++ b/regex/nfa/gough_internal.h @@ -0,0 +1,134 @@ +/* + * Copyright (c) 2015, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef GOUGH_INTERNAL_H +#define GOUGH_INTERNAL_H + +#include "accel.h" +#include "mcclellan_internal.h" +#include "ue2common.h" + +#define INVALID_SLOT (~0U) + +#define GOUGH_INS_END 0 +#define GOUGH_INS_MOV 1 +#define GOUGH_INS_NEW 2 +#define GOUGH_INS_MIN 3 +/* todo: add instructions targeting acc reg? */ + +struct gough_ins { + u32 op; /* u32 to avoid padding */ + u32 dest; + u32 src; /* for GOUGH_INS_NEW, this specifies the adjustment to apply to the + * current offset */ +}; + +/* + * HAPPY FUN ASCII ART TIME + * + * ---- + * | | struct NFA + * ---- + * ~~~~ normal(ish) mcclellan engine + * ~~~~ + * ~~~~ + * ~~~~ + * ~~~~ + * ~~~~ + * ~~~~ + * ~~~~ + * ---- = m->haig_offset + * | | } struct gough_info + * ---- + * | | } + * | | } edge prog table -> provides the offset of the start of the program + * | | } to run when the edge is taken. 0 indicates no + * | | } work to do + * ---- = h->top_prog_offset + * | | } + * | | } top prog table -> provides the offset of the start of the program + * | | } to run when a top is taken from this state. 
0 + * | | } indicates nothing to do + * ---- = h->prog_base_offset + * | | } + * | | } programs to run + * | | } + * | | } + * ---- + */ + +struct gough_info { + u32 top_prog_offset; /**< offset to the base of the top prog table */ + u32 prog_base_offset; /**< not used at runtime */ + u32 stream_som_loc_count; /**< number of som locs in the stream state */ + u8 stream_som_loc_width; /**< number of bytes per som loc */ +}; + +static really_inline +const struct gough_info *get_gough(const struct mcclellan *m) { + assert(m->haig_offset); + const char *n = (const char *)m - sizeof(struct NFA); + return (const struct gough_info *)(n + m->haig_offset); +} + +static really_inline +const u32 *get_gough_top_offsets(const struct mcclellan *m) { + const struct gough_info *g = get_gough(m); + if (!g->top_prog_offset) { + return NULL; + } + const char *n = (const char *)m - sizeof(struct NFA); + return (const u32 *)(n + g->top_prog_offset); +} + +/* Gough state representation in scratch. + * + * During execution, gough tracks a number of variables containing potential + * starts of match. These are all stored in a large array of u64a slots. + */ +struct gough_som_info { + u64a slots[1]; /* 'flexible' member array */ +}; + +struct gough_report { + ReportID r; + u32 som; /* som slot to report */ +}; + +struct gough_report_list { + u32 count; + struct gough_report report[]; +}; + +struct gough_accel { + union AccelAux accel; + u8 margin_dist; + u32 prog_offset; +}; + +#endif diff --git a/regex/nfa/lbr.c b/regex/nfa/lbr.c new file mode 100644 index 000000000..d403733a6 --- /dev/null +++ b/regex/nfa/lbr.c @@ -0,0 +1,531 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Large Bounded Repeat (LBR) engine: runtime code. 
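+ *
+ * An LBR engine tracks a single bounded repeat over one character class.
+ * The class is scanned with one of the specialised matchers below (dot,
+ * vermicelli, negated vermicelli, shufti or truffle), while the repeat
+ * bounds themselves are handled by the generic repeat code (see repeat.h).
+ * The per-class engine entry points (nfaExecLbrDot_Q and friends) are
+ * instantiated at the bottom of this file by including lbr_common_impl.h
+ * once per ENGINE_ROOT_NAME.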
+ */ +#include "lbr.h" + +#include "lbr_internal.h" +#include "nfa_api.h" +#include "nfa_api_queue.h" +#include "nfa_internal.h" +#include "repeat.h" +#include "repeat_internal.h" +#include "shufti.h" +#include "truffle.h" +#include "vermicelli.h" +#include "util/partial_store.h" +#include "util/unaligned.h" + +/** \brief Sentinel value used to indicate that a repeat is dead/empty/unused. + * * */ +#define REPEAT_DEAD 0xffffffffffffffffull + +enum MatchMode { + CALLBACK_OUTPUT, + STOP_AT_MATCH, +}; + +static really_inline +const struct RepeatInfo *getRepeatInfo(const struct lbr_common *l) { + const struct RepeatInfo *repeatInfo = + (const struct RepeatInfo *)((const char *)l + l->repeatInfoOffset); + return repeatInfo; +} + +static really_inline +void lbrCompressState(const struct lbr_common *l, u64a offset, + const struct lbr_state *lstate, char *stream_state) { + assert(l && lstate && stream_state); + assert(ISALIGNED(lstate)); + + const struct RepeatInfo *info = getRepeatInfo(l); + repeatPack(stream_state, info, &lstate->ctrl, offset); +} + +static really_inline +void lbrExpandState(const struct lbr_common *l, u64a offset, + const char *stream_state, struct lbr_state *lstate) { + assert(l && stream_state && lstate); + assert(ISALIGNED(lstate)); + + const struct RepeatInfo *info = getRepeatInfo(l); + repeatUnpack(stream_state, info, offset, &lstate->ctrl); + lstate->lastEscape = 0; +} + +static really_inline +void clearRepeat(const struct RepeatInfo *info, struct lbr_state *lstate) { + assert(info && lstate); + + DEBUG_PRINTF("clear repeat at %p\n", lstate); + + switch ((enum RepeatType)info->type) { + case REPEAT_RING: + lstate->ctrl.ring.offset = REPEAT_DEAD; + break; + case REPEAT_RANGE: + lstate->ctrl.range.offset = REPEAT_DEAD; + break; + case REPEAT_FIRST: + case REPEAT_LAST: + lstate->ctrl.offset.offset = REPEAT_DEAD; + break; + case REPEAT_BITMAP: + lstate->ctrl.bitmap.offset = REPEAT_DEAD; + break; + case REPEAT_SPARSE_OPTIMAL_P: + lstate->ctrl.ring.offset = REPEAT_DEAD; + break; + case REPEAT_TRAILER: + lstate->ctrl.trailer.offset = REPEAT_DEAD; + break; + default: + assert(0); + break; + } +} + +static really_inline +char repeatIsDead(const struct RepeatInfo *info, + const struct lbr_state *lstate) { + assert(info && lstate); + + switch ((enum RepeatType)info->type) { + case REPEAT_RING: + return lstate->ctrl.ring.offset == REPEAT_DEAD; + case REPEAT_RANGE: + return lstate->ctrl.range.offset == REPEAT_DEAD; + case REPEAT_FIRST: + case REPEAT_LAST: + return lstate->ctrl.offset.offset == REPEAT_DEAD; + case REPEAT_BITMAP: + return lstate->ctrl.bitmap.offset == REPEAT_DEAD; + case REPEAT_SPARSE_OPTIMAL_P: + return lstate->ctrl.ring.offset == REPEAT_DEAD; + case REPEAT_TRAILER: + return lstate->ctrl.trailer.offset == REPEAT_DEAD; + case REPEAT_ALWAYS: + assert(!"REPEAT_ALWAYS should only be used by Castle"); + return 0; + } + + assert(0); + return 1; +} + +/** Returns true if the LBR can produce matches at offsets greater than the + * given one. TODO: can this be combined with lbrIsActive? 
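+ * (lbrIsActive, below, additionally reports whether the repeat is matching
+ * at the given offset; lbrIsAlive only asks whether any further match is
+ * still possible.)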
*/ +static really_inline +char lbrIsAlive(const struct lbr_common *l, const struct lbr_state *lstate, + const char *state, u64a offset) { + assert(l && lstate && state); + + const struct RepeatInfo *info = getRepeatInfo(l); + if (repeatIsDead(info, lstate)) { + DEBUG_PRINTF("repeat is dead\n"); + return 0; + } + + if (info->repeatMax == REPEAT_INF) { + DEBUG_PRINTF("active repeat with inf max bound, alive\n"); + return 1; + } + + assert(info->repeatMax < REPEAT_INF); + const char *repeatState = state + info->packedCtrlSize; + u64a lastTop = repeatLastTop(info, &lstate->ctrl, repeatState); + if (offset < lastTop + info->repeatMax) { + DEBUG_PRINTF("alive, as we can still produce matches after %llu\n", + offset); + return 1; + } + + DEBUG_PRINTF("dead\n"); + return 0; +} + +/** Returns true if the LBR is matching at the given offset or it could produce + * a match in the future. */ +static really_inline +char lbrIsActive(const struct lbr_common *l, const struct lbr_state *lstate, + const char *state, u64a offset) { + assert(l && lstate && state); + const struct RepeatInfo *info = getRepeatInfo(l); + assert(!repeatIsDead(info, lstate)); // Guaranteed by caller. + + const char *repeatState = state + info->packedCtrlSize; + if (repeatHasMatch(info, &lstate->ctrl, repeatState, offset) == + REPEAT_MATCH) { + DEBUG_PRINTF("currently matching\n"); + return 1; + } + + u64a i = repeatNextMatch(info, &lstate->ctrl, repeatState, offset); + if (i != 0) { + DEBUG_PRINTF("active, next match is at %llu\n", i); + return 1; + } + + DEBUG_PRINTF("no more matches\n"); + return 0; +} + +static really_inline +void lbrTop(const struct lbr_common *l, struct lbr_state *lstate, char *state, + u64a offset) { + assert(l && lstate && state); + DEBUG_PRINTF("top at %llu\n", offset); + + const struct RepeatInfo *info = getRepeatInfo(l); + char *repeatState = state + info->packedCtrlSize; + + char is_alive = !repeatIsDead(info, lstate); + if (is_alive) { + // Ignore duplicate TOPs. + u64a last = repeatLastTop(info, &lstate->ctrl, repeatState); + assert(last <= offset); + if (last == offset) { + return; + } + } + + repeatStore(info, &lstate->ctrl, repeatState, offset, is_alive); +} + +static really_inline +char lbrInAccept(const struct lbr_common *l, const struct lbr_state *lstate, + const char *state, u64a offset, ReportID report) { + assert(l && lstate && state); + DEBUG_PRINTF("offset=%llu, report=%u\n", offset, report); + + if (report != l->report) { + DEBUG_PRINTF("report=%u is not LBR report %u\n", report, l->report); + return 0; + } + + const struct RepeatInfo *info = getRepeatInfo(l); + assert(!repeatIsDead(info, lstate)); // Guaranteed by caller. 
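+    /* The packed control block occupies the first info->packedCtrlSize
+     * bytes of the stream state; the repeat-model specific state (ring,
+     * bitmap, trailer, ...) follows immediately after it. */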
+ + const char *repeatState = state + info->packedCtrlSize; + return repeatHasMatch(info, &lstate->ctrl, repeatState, offset) == + REPEAT_MATCH; +} + +static really_inline +char lbrFindMatch(const struct lbr_common *l, const u64a begin, const u64a end, + const struct lbr_state *lstate, const char *state, + size_t *mloc) { + DEBUG_PRINTF("begin=%llu, end=%llu\n", begin, end); + assert(begin <= end); + + if (begin == end) { + return 0; + } + + const struct RepeatInfo *info = getRepeatInfo(l); + const char *repeatState = state + info->packedCtrlSize; + u64a i = repeatNextMatch(info, &lstate->ctrl, repeatState, begin); + if (i == 0) { + DEBUG_PRINTF("no more matches\n"); + return 0; + } + if (i > end) { + DEBUG_PRINTF("next match at %llu is beyond the horizon\n", i); + return 0; + } + + DEBUG_PRINTF("stop at match at %llu\n", i); + assert(mloc); + *mloc = i - begin; + return 1; +} + +static really_inline +char lbrMatchLoop(const struct lbr_common *l, const u64a begin, const u64a end, + const struct lbr_state *lstate, const char *state, + NfaCallback cb, void *ctx) { + DEBUG_PRINTF("begin=%llu, end=%llu\n", begin, end); + assert(begin <= end); + + if (begin == end) { + return MO_CONTINUE_MATCHING; + } + + const struct RepeatInfo *info = getRepeatInfo(l); + const char *repeatState = state + info->packedCtrlSize; + + u64a i = begin; + for (;;) { + i = repeatNextMatch(info, &lstate->ctrl, repeatState, i); + if (i == 0) { + DEBUG_PRINTF("no more matches\n"); + return MO_CONTINUE_MATCHING; + } + if (i > end) { + DEBUG_PRINTF("next match at %llu is beyond the horizon\n", i); + return MO_CONTINUE_MATCHING; + } + + DEBUG_PRINTF("firing match at %llu\n", i); + if (cb(0, i, l->report, ctx) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } + + assert(0); + return MO_CONTINUE_MATCHING; +} + +static really_inline +char lbrRevScanDot(UNUSED const struct NFA *nfa, UNUSED const u8 *buf, + UNUSED size_t begin, UNUSED size_t end, + UNUSED size_t *loc) { + assert(begin <= end); + assert(nfa->type == LBR_NFA_DOT); + // Nothing can kill a dot! 
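+    /* A dot class matches every byte, so there is no escape character to
+     * scan for; the arguments are deliberately UNUSED and we always report
+     * that no escape was found. */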
+ return 0; +} + +static really_inline +char lbrRevScanVerm(const struct NFA *nfa, const u8 *buf, + size_t begin, size_t end, size_t *loc) { + assert(begin <= end); + assert(nfa->type == LBR_NFA_VERM); + const struct lbr_verm *l = getImplNfa(nfa); + + if (begin == end) { + return 0; + } + + const u8 *ptr = rvermicelliExec(l->c, 0, buf + begin, buf + end); + if (ptr == buf + begin - 1) { + DEBUG_PRINTF("no escape found\n"); + return 0; + } + + assert(loc); + *loc = (size_t)(ptr - buf); + DEBUG_PRINTF("escape found at offset %zu\n", *loc); + assert((char)*ptr == l->c); + return 1; +} + +static really_inline +char lbrRevScanNVerm(const struct NFA *nfa, const u8 *buf, + size_t begin, size_t end, size_t *loc) { + assert(begin <= end); + assert(nfa->type == LBR_NFA_NVERM); + const struct lbr_verm *l = getImplNfa(nfa); + + if (begin == end) { + return 0; + } + + const u8 *ptr = rnvermicelliExec(l->c, 0, buf + begin, buf + end); + if (ptr == buf + begin - 1) { + DEBUG_PRINTF("no escape found\n"); + return 0; + } + + assert(loc); + *loc = (size_t)(ptr - buf); + DEBUG_PRINTF("escape found at offset %zu\n", *loc); + assert((char)*ptr != l->c); + return 1; +} + +static really_inline +char lbrRevScanShuf(const struct NFA *nfa, const u8 *buf, + size_t begin, size_t end, + size_t *loc) { + assert(begin <= end); + assert(nfa->type == LBR_NFA_SHUF); + const struct lbr_shuf *l = getImplNfa(nfa); + + if (begin == end) { + return 0; + } + + const u8 *ptr = rshuftiExec(l->mask_lo, l->mask_hi, buf + begin, buf + end); + if (ptr == buf + begin - 1) { + DEBUG_PRINTF("no escape found\n"); + return 0; + } + + assert(loc); + *loc = (size_t)(ptr - buf); + DEBUG_PRINTF("escape found at offset %zu\n", *loc); + return 1; +} + +static really_inline +char lbrRevScanTruf(const struct NFA *nfa, const u8 *buf, + size_t begin, size_t end, + size_t *loc) { + assert(begin <= end); + assert(nfa->type == LBR_NFA_TRUF); + const struct lbr_truf *l = getImplNfa(nfa); + + if (begin == end) { + return 0; + } + + const u8 *ptr = rtruffleExec(l->mask1, l->mask2, buf + begin, buf + end); + if (ptr == buf + begin - 1) { + DEBUG_PRINTF("no escape found\n"); + return 0; + } + + assert(loc); + *loc = (size_t)(ptr - buf); + DEBUG_PRINTF("escape found at offset %zu\n", *loc); + return 1; +} + +static really_inline +char lbrFwdScanDot(UNUSED const struct NFA *nfa, UNUSED const u8 *buf, + UNUSED size_t begin, UNUSED size_t end, + UNUSED size_t *loc) { + assert(begin <= end); + assert(nfa->type == LBR_NFA_DOT); + // Nothing can kill a dot! 
+ return 0; +} + +static really_inline +char lbrFwdScanVerm(const struct NFA *nfa, const u8 *buf, + size_t begin, size_t end, size_t *loc) { + assert(begin <= end); + assert(nfa->type == LBR_NFA_VERM); + const struct lbr_verm *l = getImplNfa(nfa); + + if (begin == end) { + return 0; + } + + const u8 *ptr = vermicelliExec(l->c, 0, buf + begin, buf + end); + if (ptr == buf + end) { + DEBUG_PRINTF("no escape found\n"); + return 0; + } + + assert(loc); + *loc = (size_t)(ptr - buf); + DEBUG_PRINTF("escape found at offset %zu\n", *loc); + assert((char)*ptr == l->c); + return 1; +} + +static really_inline +char lbrFwdScanNVerm(const struct NFA *nfa, const u8 *buf, + size_t begin, size_t end, size_t *loc) { + assert(begin <= end); + assert(nfa->type == LBR_NFA_NVERM); + const struct lbr_verm *l = getImplNfa(nfa); + + if (begin == end) { + return 0; + } + + const u8 *ptr = nvermicelliExec(l->c, 0, buf + begin, buf + end); + if (ptr == buf + end) { + DEBUG_PRINTF("no escape found\n"); + return 0; + } + + assert(loc); + *loc = (size_t)(ptr - buf); + DEBUG_PRINTF("escape found at offset %zu\n", *loc); + assert((char)*ptr != l->c); + return 1; +} + +static really_inline +char lbrFwdScanShuf(const struct NFA *nfa, const u8 *buf, + size_t begin, size_t end, + size_t *loc) { + assert(begin <= end); + assert(nfa->type == LBR_NFA_SHUF); + const struct lbr_shuf *l = getImplNfa(nfa); + + if (begin == end) { + return 0; + } + + const u8 *ptr = shuftiExec(l->mask_lo, l->mask_hi, buf + begin, buf + end); + if (ptr == buf + end) { + DEBUG_PRINTF("no escape found\n"); + return 0; + } + + assert(loc); + *loc = (size_t)(ptr - buf); + DEBUG_PRINTF("escape found at offset %zu\n", *loc); + return 1; +} + +static really_inline +char lbrFwdScanTruf(const struct NFA *nfa, const u8 *buf, + size_t begin, size_t end, + size_t *loc) { + assert(begin <= end); + assert(nfa->type == LBR_NFA_TRUF); + const struct lbr_truf *l = getImplNfa(nfa); + + if (begin == end) { + return 0; + } + + const u8 *ptr = truffleExec(l->mask1, l->mask2, buf + begin, buf + end); + if (ptr == buf + end) { + DEBUG_PRINTF("no escape found\n"); + return 0; + } + + assert(loc); + *loc = (size_t)(ptr - buf); + DEBUG_PRINTF("escape found at offset %zu\n", *loc); + return 1; +} + +#define ENGINE_ROOT_NAME Dot +#include "lbr_common_impl.h" + +#define ENGINE_ROOT_NAME Verm +#include "lbr_common_impl.h" + +#define ENGINE_ROOT_NAME NVerm +#include "lbr_common_impl.h" + +#define ENGINE_ROOT_NAME Shuf +#include "lbr_common_impl.h" + +#define ENGINE_ROOT_NAME Truf +#include "lbr_common_impl.h" diff --git a/regex/nfa/lbr.h b/regex/nfa/lbr.h new file mode 100644 index 000000000..a9e42046d --- /dev/null +++ b/regex/nfa/lbr.h @@ -0,0 +1,150 @@ +/* + * Copyright (c) 2015-2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef LBR_H +#define LBR_H + +#include "ue2common.h" + +struct mq; +struct NFA; + +#ifdef __cplusplus +extern "C" +{ +#endif + +// LBR Dot + +char nfaExecLbrDot_Q(const struct NFA *n, struct mq *q, s64a end); +char nfaExecLbrDot_Q2(const struct NFA *n, struct mq *q, s64a end); +char nfaExecLbrDot_QR(const struct NFA *n, struct mq *q, ReportID report); +char nfaExecLbrDot_reportCurrent(const struct NFA *n, struct mq *q); +char nfaExecLbrDot_inAccept(const struct NFA *n, ReportID report, struct mq *q); +char nfaExecLbrDot_inAnyAccept(const struct NFA *n, struct mq *q); +char nfaExecLbrDot_queueInitState(const struct NFA *n, struct mq *q); +char nfaExecLbrDot_initCompressedState(const struct NFA *n, u64a offset, + void *state, u8 key); +char nfaExecLbrDot_queueCompressState(const struct NFA *nfa, const struct mq *q, + s64a loc); +char nfaExecLbrDot_expandState(const struct NFA *nfa, void *dest, + const void *src, u64a offset, u8 key); + +#define nfaExecLbrDot_testEOD NFA_API_NO_IMPL +#define nfaExecLbrDot_B_Reverse NFA_API_NO_IMPL +#define nfaExecLbrDot_zombie_status NFA_API_ZOMBIE_NO_IMPL + +// LBR Verm + +char nfaExecLbrVerm_Q(const struct NFA *n, struct mq *q, s64a end); +char nfaExecLbrVerm_Q2(const struct NFA *n, struct mq *q, s64a end); +char nfaExecLbrVerm_QR(const struct NFA *n, struct mq *q, ReportID report); +char nfaExecLbrVerm_reportCurrent(const struct NFA *n, struct mq *q); +char nfaExecLbrVerm_inAccept(const struct NFA *n, ReportID report, + struct mq *q); +char nfaExecLbrVerm_inAnyAccept(const struct NFA *n, struct mq *q); +char nfaExecLbrVerm_queueInitState(const struct NFA *n, struct mq *q); +char nfaExecLbrVerm_initCompressedState(const struct NFA *n, u64a offset, + void *state, u8 key); +char nfaExecLbrVerm_queueCompressState(const struct NFA *nfa, + const struct mq *q, s64a loc); +char nfaExecLbrVerm_expandState(const struct NFA *nfa, void *dest, + const void *src, u64a offset, u8 key); + +#define nfaExecLbrVerm_testEOD NFA_API_NO_IMPL +#define nfaExecLbrVerm_B_Reverse NFA_API_NO_IMPL +#define nfaExecLbrVerm_zombie_status NFA_API_ZOMBIE_NO_IMPL + +// LBR Negated Verm + +char nfaExecLbrNVerm_Q(const struct NFA *n, struct mq *q, s64a end); +char nfaExecLbrNVerm_Q2(const struct NFA *n, struct mq *q, s64a end); +char nfaExecLbrNVerm_QR(const struct NFA *n, struct mq *q, ReportID report); +char nfaExecLbrNVerm_reportCurrent(const struct NFA *n, struct mq *q); +char nfaExecLbrNVerm_inAccept(const struct NFA *n, ReportID report, + struct mq *q); +char nfaExecLbrNVerm_inAnyAccept(const struct NFA *n, struct mq *q); +char nfaExecLbrNVerm_queueInitState(const struct NFA *n, struct mq *q); +char nfaExecLbrNVerm_initCompressedState(const struct NFA *n, u64a offset, + void *state, u8 
key); +char nfaExecLbrNVerm_queueCompressState(const struct NFA *nfa, + const struct mq *q, s64a loc); +char nfaExecLbrNVerm_expandState(const struct NFA *nfa, void *dest, + const void *src, u64a offset, u8 key); + +#define nfaExecLbrNVerm_testEOD NFA_API_NO_IMPL +#define nfaExecLbrNVerm_B_Reverse NFA_API_NO_IMPL +#define nfaExecLbrNVerm_zombie_status NFA_API_ZOMBIE_NO_IMPL + +// LBR Shuf + +char nfaExecLbrShuf_Q(const struct NFA *n, struct mq *q, s64a end); +char nfaExecLbrShuf_Q2(const struct NFA *n, struct mq *q, s64a end); +char nfaExecLbrShuf_QR(const struct NFA *n, struct mq *q, ReportID report); +char nfaExecLbrShuf_reportCurrent(const struct NFA *n, struct mq *q); +char nfaExecLbrShuf_inAccept(const struct NFA *n, ReportID report, + struct mq *q); +char nfaExecLbrShuf_inAnyAccept(const struct NFA *n, struct mq *q); +char nfaExecLbrShuf_queueInitState(const struct NFA *n, struct mq *q); +char nfaExecLbrShuf_initCompressedState(const struct NFA *n, u64a offset, + void *state, u8 key); +char nfaExecLbrShuf_queueCompressState(const struct NFA *nfa, + const struct mq *q, s64a loc); +char nfaExecLbrShuf_expandState(const struct NFA *nfa, void *dest, + const void *src, u64a offset, u8 key); + +#define nfaExecLbrShuf_testEOD NFA_API_NO_IMPL +#define nfaExecLbrShuf_B_Reverse NFA_API_NO_IMPL +#define nfaExecLbrShuf_zombie_status NFA_API_ZOMBIE_NO_IMPL + +// LBR Truffle + +char nfaExecLbrTruf_Q(const struct NFA *n, struct mq *q, s64a end); +char nfaExecLbrTruf_Q2(const struct NFA *n, struct mq *q, s64a end); +char nfaExecLbrTruf_QR(const struct NFA *n, struct mq *q, ReportID report); +char nfaExecLbrTruf_reportCurrent(const struct NFA *n, struct mq *q); +char nfaExecLbrTruf_inAccept(const struct NFA *n, ReportID report, + struct mq *q); +char nfaExecLbrTruf_inAnyAccept(const struct NFA *n, struct mq *q); +char nfaExecLbrTruf_queueInitState(const struct NFA *n, struct mq *q); +char nfaExecLbrTruf_initCompressedState(const struct NFA *n, u64a offset, + void *state, u8 key); +char nfaExecLbrTruf_queueCompressState(const struct NFA *nfa, + const struct mq *q, s64a loc); +char nfaExecLbrTruf_expandState(const struct NFA *nfa, void *dest, + const void *src, u64a offset, u8 key); + +#define nfaExecLbrTruf_testEOD NFA_API_NO_IMPL +#define nfaExecLbrTruf_B_Reverse NFA_API_NO_IMPL +#define nfaExecLbrTruf_zombie_status NFA_API_ZOMBIE_NO_IMPL + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/regex/nfa/lbr_common_impl.h b/regex/nfa/lbr_common_impl.h new file mode 100644 index 000000000..5ae35431e --- /dev/null +++ b/regex/nfa/lbr_common_impl.h @@ -0,0 +1,462 @@ +/* + * Copyright (c) 2015-2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Large Bounded Repeat (LBR) engine: runtime impl X-macros. + */ + +#include "util/join.h" + +#define ENGINE_EXEC_NAME JOIN(nfaExecLbr, ENGINE_ROOT_NAME) +#define EXEC_FN JOIN(lbrExec, ENGINE_ROOT_NAME) +#define FWDSCAN_FN JOIN(lbrFwdScan, ENGINE_ROOT_NAME) +#define REVSCAN_FN JOIN(lbrRevScan, ENGINE_ROOT_NAME) + +char JOIN(ENGINE_EXEC_NAME, _queueCompressState)(const struct NFA *nfa, + const struct mq *q, s64a loc) { + assert(nfa && q); + assert(isLbrType(nfa->type)); + DEBUG_PRINTF("entry, q->offset=%llu, loc=%lld\n", q->offset, loc); + + const struct lbr_common *l = getImplNfa(nfa); + const struct lbr_state *lstate = (const struct lbr_state *)q->state; + + u64a offset = q->offset + loc; + lbrCompressState(l, offset, lstate, q->streamState); + return 0; +} + +char JOIN(ENGINE_EXEC_NAME, _expandState)(const struct NFA *nfa, void *dest, + const void *src, u64a offset, + UNUSED u8 key) { + assert(nfa); + assert(isLbrType(nfa->type)); + DEBUG_PRINTF("entry, offset=%llu\n", offset); + + const struct lbr_common *l = getImplNfa(nfa); + struct lbr_state *lstate = (struct lbr_state *)dest; + lbrExpandState(l, offset, src, lstate); + return 0; +} + +char JOIN(ENGINE_EXEC_NAME, _reportCurrent)(const struct NFA *nfa, + struct mq *q) { + assert(nfa && q); + assert(isLbrType(nfa->type)); + + const struct lbr_common *l = getImplNfa(nfa); + u64a offset = q_cur_offset(q); + DEBUG_PRINTF("firing match %u at %llu\n", l->report, offset); + q->cb(0, offset, l->report, q->context); + return 0; +} + +char JOIN(ENGINE_EXEC_NAME, _inAccept)(const struct NFA *nfa, + ReportID report, struct mq *q) { + assert(nfa && q); + assert(isLbrType(nfa->type)); + DEBUG_PRINTF("entry\n"); + + const struct lbr_common *l = getImplNfa(nfa); + const struct RepeatInfo *info = getRepeatInfo(l); + const struct lbr_state *lstate = (const struct lbr_state *)q->state; + if (repeatIsDead(info, lstate)) { + DEBUG_PRINTF("repeat is dead\n"); + return 0; + } + + u64a offset = q->offset + q_last_loc(q); + return lbrInAccept(l, lstate, q->streamState, offset, report); +} + +char JOIN(ENGINE_EXEC_NAME, _inAnyAccept)(const struct NFA *nfa, struct mq *q) { + assert(nfa && q); + assert(isLbrType(nfa->type)); + DEBUG_PRINTF("entry\n"); + + const struct lbr_common *l = getImplNfa(nfa); + return JOIN(ENGINE_EXEC_NAME, _inAccept)(nfa, l->report, q); +} + +char JOIN(ENGINE_EXEC_NAME, _queueInitState)(const struct NFA *nfa, + struct mq *q) { + assert(nfa && q); + assert(isLbrType(nfa->type)); + DEBUG_PRINTF("entry\n"); + + const struct lbr_common *l = getImplNfa(nfa); + const struct RepeatInfo *info = getRepeatInfo(l); + + assert(q->state); + struct lbr_state *lstate = (struct lbr_state *)q->state; + assert(ISALIGNED(lstate)); + + 
lstate->lastEscape = 0; + clearRepeat(info, lstate); + + return 0; +} + +char JOIN(ENGINE_EXEC_NAME, _initCompressedState)(const struct NFA *nfa, + u64a offset, + void *state, UNUSED u8 key) { + assert(nfa && state); + assert(isLbrType(nfa->type)); + DEBUG_PRINTF("entry\n"); + + const struct lbr_common *l = getImplNfa(nfa); + const struct RepeatInfo *info = getRepeatInfo(l); + struct lbr_state lstate; // temp control block on stack. + clearRepeat(info, &lstate); + lbrTop(l, &lstate, state, offset); + lbrCompressState(l, offset, &lstate, state); + + return 1; // LBR is alive +} + +// FIXME: this function could be much simpler for a Dot LBR, as all it needs to +// do is find the next top. +static really_inline +char JOIN(ENGINE_EXEC_NAME, _TopScan)(const struct NFA *nfa, struct mq *q, + s64a end) { + const struct lbr_common *l = getImplNfa(nfa); + const struct RepeatInfo *info = getRepeatInfo(l); + + const u64a offset = q->offset; + struct lbr_state *lstate = (struct lbr_state *)q->state; + assert(ISALIGNED(lstate)); + + assert(repeatIsDead(info, lstate)); + assert(q->cur < q->end); + + DEBUG_PRINTF("entry, end=%lld, offset=%llu, lastEscape=%llu\n", end, + offset, lstate->lastEscape); + + while (1) { + // Find the next top with location >= the last escape we saw. + for (; q->cur < q->end && q_cur_loc(q) <= end; q->cur++) { + u32 event = q_cur_type(q); + if ((event == MQE_TOP || event == MQE_TOP_FIRST) && + q_cur_offset(q) >= lstate->lastEscape) { + goto found_top; + } + DEBUG_PRINTF("skip event type=%u offset=%lld\n", event, q_cur_offset(q)); + } + + // No more tops, we're done. + break; + +found_top:; + assert(q->cur < q->end); + + u64a sp = q_cur_offset(q); + u64a first_match = sp + info->repeatMin; + DEBUG_PRINTF("first possible match is at %llu\n", first_match); + + u64a ep = MIN(MIN(end, (s64a)q->length) + offset, first_match); + if (ep > sp && sp >= offset) { + size_t eloc; + DEBUG_PRINTF("rev b%llu e%llu/%zu\n", sp - offset, ep - offset, + q->length); + assert(ep - offset <= q->length); + if (REVSCAN_FN(nfa, q->buffer, sp - offset, ep - offset, &eloc)) { + DEBUG_PRINTF("escape found at %llu\n", offset + eloc); + lstate->lastEscape = eloc; + q->cur++; + continue; + } + } + + lbrTop(l, lstate, q->streamState, sp); + return 1; + } + + DEBUG_PRINTF("exhausted queue\n"); + return 0; +} + +static really_inline +char JOIN(ENGINE_EXEC_NAME, _Q_i)(const struct NFA *nfa, struct mq *q, + s64a end, enum MatchMode mode) { + assert(nfa && q); + assert(isLbrType(nfa->type)); + + const struct lbr_common *l = getImplNfa(nfa); + const struct RepeatInfo *info = getRepeatInfo(l); + + struct lbr_state *lstate = (struct lbr_state *)q->state; + assert(ISALIGNED(lstate)); + + + if (q->report_current) { + DEBUG_PRINTF("report_current: fire match at %llu\n", q_cur_offset(q)); + int rv = q->cb(0, q_cur_offset(q), l->report, q->context); + q->report_current = 0; + if (rv == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } + + if (q->cur == q->end) { + return 1; + } + + assert(q->cur + 1 < q->end); /* require at least two items */ + assert(q_cur_type(q) == MQE_START); + u64a sp = q_cur_offset(q); + q->cur++; + DEBUG_PRINTF("sp=%llu, abs_end=%llu\n", sp, end + q->offset); + + while (q->cur < q->end) { + DEBUG_PRINTF("q item type=%d offset=%llu\n", q_cur_type(q), + q_cur_offset(q)); + + assert(sp >= q->offset); // not in history + + if (repeatIsDead(info, lstate)) { + DEBUG_PRINTF("repeat is currently dead, skipping scan\n"); + goto scan_done; + } + + u64a ep = q_cur_offset(q); + ep = MIN(ep, q->offset + 
end); + if (sp < ep) { + size_t eloc = 0; + char escape_found = 0; + DEBUG_PRINTF("scanning from sp=%llu to ep=%llu\n", sp, ep); + assert(sp >= q->offset && ep >= q->offset); + if (FWDSCAN_FN(nfa, q->buffer, sp - q->offset, ep - q->offset, &eloc)) { + escape_found = 1; + ep = q->offset + eloc; + DEBUG_PRINTF("escape found at %llu\n", ep); + assert(ep >= sp); + } + + assert(sp <= ep); + + if (mode == STOP_AT_MATCH) { + size_t mloc; + if (lbrFindMatch(l, sp, ep, lstate, q->streamState, &mloc)) { + DEBUG_PRINTF("storing match at %llu\n", sp + mloc); + q->cur--; + assert(q->cur < MAX_MQE_LEN); + q->items[q->cur].type = MQE_START; + q->items[q->cur].location = (s64a)(sp - q->offset) + mloc; + return MO_MATCHES_PENDING; + } + } else { + assert(mode == CALLBACK_OUTPUT); + char rv = lbrMatchLoop(l, sp, ep, lstate, q->streamState, q->cb, + q->context); + if (rv == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + assert(rv == MO_CONTINUE_MATCHING); + } + + if (escape_found) { + DEBUG_PRINTF("clearing repeat due to escape\n"); + clearRepeat(info, lstate); + } + } + + scan_done: + if (q_cur_loc(q) > end) { + q->cur--; + assert(q->cur < MAX_MQE_LEN); + q->items[q->cur].type = MQE_START; + q->items[q->cur].location = end; + return MO_ALIVE; + } + + if (repeatIsDead(info, lstate)) { + if (!JOIN(ENGINE_EXEC_NAME, _TopScan)(nfa, q, end)) { + assert(repeatIsDead(info, lstate)); + if (q->cur < q->end && q_cur_loc(q) > end) { + q->cur--; + assert(q->cur < MAX_MQE_LEN); + q->items[q->cur].type = MQE_START; + q->items[q->cur].location = end; + return MO_ALIVE; + } + return 0; + } + DEBUG_PRINTF("cur offset = %llu\n", q_cur_offset(q)); + } else { + switch (q_cur_type(q)) { + case MQE_TOP: + case MQE_TOP_FIRST: + lbrTop(l, lstate, q->streamState, q_cur_offset(q)); + break; + case MQE_START: + case MQE_END: + break; + default: + DEBUG_PRINTF("unhandled event %d!\n", q_cur_type(q)); + assert(0); + break; + } + } + + sp = q_cur_offset(q); + q->cur++; + } + + return lbrIsAlive(l, lstate, q->streamState, sp); +} + +char JOIN(ENGINE_EXEC_NAME, _Q)(const struct NFA *nfa, struct mq *q, s64a end) { + DEBUG_PRINTF("entry, offset=%llu, end=%lld\n", q->offset, end); + return JOIN(ENGINE_EXEC_NAME, _Q_i)(nfa, q, end, CALLBACK_OUTPUT); +} + +char JOIN(ENGINE_EXEC_NAME, _Q2)(const struct NFA *nfa, struct mq *q, s64a end) { + DEBUG_PRINTF("entry, offset=%llu, end=%lld\n", q->offset, end); + return JOIN(ENGINE_EXEC_NAME, _Q_i)(nfa, q, end, STOP_AT_MATCH); +} + +static really_inline +void JOIN(ENGINE_EXEC_NAME, _StreamSilent)(const struct NFA *nfa, struct mq *q, + const u8 *buf, size_t length) { + const struct lbr_common *l = getImplNfa(nfa); + const struct RepeatInfo *info = getRepeatInfo(l); + struct lbr_state *lstate = (struct lbr_state *)q->state; + assert(ISALIGNED(lstate)); + + assert(!repeatIsDead(info, lstate)); + + // This call doesn't produce matches, so we elide the lbrMatchLoop call + // entirely and just do escape scans to maintain the repeat. + + size_t eloc = 0; + char escaped = FWDSCAN_FN(nfa, buf, 0, length, &eloc); + if (escaped) { + assert(eloc < length); + DEBUG_PRINTF("escape found at %zu, clearing repeat\n", eloc); + clearRepeat(info, lstate); + } +} + +// Rose infix path. 
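+// The QR entry point replays the queued events without firing match
+// callbacks: the silent scans above keep the repeat in sync with escapes in
+// both the history and main buffers, tops re-arm it via lbrTop/TopScan, and
+// the return value only reflects whether the given report is accepting here
+// (MO_MATCHES_PENDING) or the repeat is still live.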
+char JOIN(ENGINE_EXEC_NAME, _QR)(const struct NFA *nfa, struct mq *q, + ReportID report) { + assert(nfa && q); + assert(isLbrType(nfa->type)); + + if (q->cur == q->end) { + return 1; + } + + assert(q->cur + 1 < q->end); /* require at least two items */ + assert(q_cur_type(q) == MQE_START); + u64a sp = q_cur_offset(q); + q->cur++; + DEBUG_PRINTF("sp=%llu\n", sp); + + const struct lbr_common *l = getImplNfa(nfa); + const struct RepeatInfo *info = getRepeatInfo(l); + struct lbr_state *lstate = (struct lbr_state *)q->state; + assert(ISALIGNED(lstate)); + const s64a lastLoc = q_last_loc(q); + + while (q->cur < q->end) { + DEBUG_PRINTF("q item type=%d offset=%llu\n", q_cur_type(q), + q_cur_offset(q)); + + if (repeatIsDead(info, lstate)) { + DEBUG_PRINTF("repeat is dead\n"); + goto scan_done; + } + + u64a ep = q_cur_offset(q); + + if (sp < q->offset) { + DEBUG_PRINTF("HISTORY BUFFER SCAN\n"); + assert(q->offset - sp <= q->hlength); + u64a local_ep = MIN(q->offset, ep); + const u8 *ptr = q->history + q->hlength + sp - q->offset; + JOIN(ENGINE_EXEC_NAME, _StreamSilent)(nfa, q, ptr, local_ep - sp); + sp = local_ep; + } + + if (repeatIsDead(info, lstate)) { + DEBUG_PRINTF("repeat is dead\n"); + goto scan_done; + } + + if (sp < ep) { + DEBUG_PRINTF("MAIN BUFFER SCAN\n"); + assert(ep - q->offset <= q->length); + const u8 *ptr = q->buffer + sp - q->offset; + JOIN(ENGINE_EXEC_NAME, _StreamSilent)(nfa, q, ptr, ep - sp); + } + + if (repeatIsDead(info, lstate)) { +scan_done: + if (!JOIN(ENGINE_EXEC_NAME, _TopScan)(nfa, q, lastLoc)) { + assert(repeatIsDead(info, lstate)); + assert(q->cur == q->end); + return 0; + } + } else { + switch (q_cur_type(q)) { + case MQE_TOP: + case MQE_TOP_FIRST: + lbrTop(l, lstate, q->streamState, q_cur_offset(q)); + break; + case MQE_START: + case MQE_END: + break; + default: + DEBUG_PRINTF("unhandled event %d!\n", q_cur_type(q)); + assert(0); + break; + } + } + + sp = q_cur_offset(q); + q->cur++; + } + + if (repeatIsDead(info, lstate)) { + DEBUG_PRINTF("repeat is dead\n"); + return 0; + } + + if (lbrInAccept(l, lstate, q->streamState, sp, report)) { + return MO_MATCHES_PENDING; + } + + return lbrIsActive(l, lstate, q->streamState, sp); +} + +#undef ENGINE_EXEC_NAME +#undef EXEC_FN +#undef FWDSCAN_FN +#undef REVSCAN_FN +#undef ENGINE_ROOT_NAME diff --git a/regex/nfa/lbr_internal.h b/regex/nfa/lbr_internal.h new file mode 100644 index 000000000..8ba11dd4d --- /dev/null +++ b/regex/nfa/lbr_internal.h @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2015, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Large Bounded Repeat (LBR): data structures. + */ + +#ifndef LBR_INTERNAL_H +#define LBR_INTERNAL_H + +#ifdef __cplusplus +extern "C" +{ +#endif + +#include "repeat_internal.h" + +/** \brief Common LBR header. */ +struct lbr_common { + u32 repeatInfoOffset; //!< offset of RepeatInfo structure relative + // to the start of lbr_common + ReportID report; //!< report to raise on match +}; + +struct lbr_dot { + struct lbr_common common; +}; + +struct lbr_verm { + struct lbr_common common; + char c; //!< escape char +}; + +struct lbr_shuf { + struct lbr_common common; + m128 mask_lo; //!< shufti lo mask for escape chars + m128 mask_hi; //!< shufti hi mask for escape chars +}; + +struct lbr_truf { + struct lbr_common common; + m128 mask1; + m128 mask2; +}; + +/** \brief Uncompressed ("full") state structure used by the LBR. This is + * stored in scratch, not in stream state. */ +struct lbr_state { + u64a lastEscape; //!< \brief offset of last escape seen. + union RepeatControl ctrl; //!< \brief repeat control block. */ +}; + +#ifdef __cplusplus +} +#endif + +#endif // LBR_INTERNAL_H diff --git a/regex/nfa/limex.h b/regex/nfa/limex.h new file mode 100644 index 000000000..0223604da --- /dev/null +++ b/regex/nfa/limex.h @@ -0,0 +1,91 @@ +/* + * Copyright (c) 2015-2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */
+
+#ifndef LIMEX_H
+#define LIMEX_H
+
+#ifdef __cplusplus
+#include <string>
+extern "C"
+{
+#endif
+
+#include "nfa_api.h"
+
+#if defined(DUMP_SUPPORT) && defined(__cplusplus)
+#define GENERATE_NFA_DUMP_DECL(gf_name)                                     \
+    } /* extern "C" */                                                      \
+    namespace ue2 {                                                         \
+    void gf_name##_dump(const struct NFA *nfa, const std::string &base);    \
+    } /* namespace ue2 */                                                   \
+    extern "C" {
+
+#else
+#define GENERATE_NFA_DUMP_DECL(gf_name)
+#endif
+
+#define GENERATE_NFA_DECL(gf_name)                                           \
+    char gf_name##_testEOD(const struct NFA *nfa, const char *state,         \
+                           const char *streamState, u64a offset,             \
+                           NfaCallback callback, void *context);             \
+    char gf_name##_Q(const struct NFA *n, struct mq *q, s64a end);           \
+    char gf_name##_Q2(const struct NFA *n, struct mq *q, s64a end);          \
+    char gf_name##_QR(const struct NFA *n, struct mq *q, ReportID report);   \
+    char gf_name##_reportCurrent(const struct NFA *n, struct mq *q);         \
+    char gf_name##_inAccept(const struct NFA *n, ReportID report,            \
+                            struct mq *q);                                   \
+    char gf_name##_inAnyAccept(const struct NFA *n, struct mq *q);           \
+    char gf_name##_queueInitState(const struct NFA *n, struct mq *q);        \
+    char gf_name##_initCompressedState(const struct NFA *n, u64a offset,     \
+                                       void *state, u8 key);                 \
+    char gf_name##_B_Reverse(const struct NFA *n, u64a offset, const u8 *buf, \
+                             size_t buflen, const u8 *hbuf, size_t hlen,     \
+                             NfaCallback cb, void *context);                 \
+    char gf_name##_queueCompressState(const struct NFA *nfa,                 \
+                                      const struct mq *q, s64a loc);         \
+    char gf_name##_expandState(const struct NFA *nfa, void *dest,            \
+                               const void *src, u64a offset, u8 key);        \
+    enum nfa_zombie_status gf_name##_zombie_status(const struct NFA *nfa,    \
+                                                   struct mq *q, s64a loc);  \
+    GENERATE_NFA_DUMP_DECL(gf_name)
+
+GENERATE_NFA_DECL(nfaExecLimEx32)
+GENERATE_NFA_DECL(nfaExecLimEx64)
+GENERATE_NFA_DECL(nfaExecLimEx128)
+GENERATE_NFA_DECL(nfaExecLimEx256)
+GENERATE_NFA_DECL(nfaExecLimEx384)
+GENERATE_NFA_DECL(nfaExecLimEx512)
+
+#undef GENERATE_NFA_DECL
+#undef GENERATE_NFA_DUMP_DECL
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/regex/nfa/limex_64.c b/regex/nfa/limex_64.c
new file mode 100644
index 000000000..e8f0880b2
--- /dev/null
+++ b/regex/nfa/limex_64.c
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2015-2016, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/** \file
+ * \brief LimEx NFA: 64-bit runtime implementations.
+ */
+
+/* Limex64 is unusual in that, on 32-bit platforms, it uses an m128 for its
+ * state calculations at runtime.
+ */
+
+//#define DEBUG_INPUT
+//#define DEBUG_EXCEPTIONS
+
+#include "limex.h"
+
+#include "accel.h"
+#include "limex_internal.h"
+#include "nfa_internal.h"
+#include "ue2common.h"
+#include "util/bitutils.h"
+#include "util/simd_utils.h"
+
+// Common code
+#define STATE_ON_STACK
+#define ESTATE_ON_STACK
+
+#include "limex_runtime.h"
+
+#define SIZE 64
+#define ENG_STATE_T u64a
+
+#ifdef ARCH_64_BIT
+#define STATE_T u64a
+#define LOAD_FROM_ENG load_u64a
+#else
+#define STATE_T m128
+#define LOAD_FROM_ENG load_m128_from_u64a
+#endif
+
+#include "limex_exceptional.h"
+
+#include "limex_state_impl.h"
+
+#define INLINE_ATTR really_inline
+#include "limex_common_impl.h"
+
+#include "limex_runtime_impl.h"
diff --git a/regex/nfa/limex_accel.c b/regex/nfa/limex_accel.c
new file mode 100644
index 000000000..4834b6a54
--- /dev/null
+++ b/regex/nfa/limex_accel.c
@@ -0,0 +1,170 @@
+/*
+ * Copyright (c) 2015-2017, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/** \file
+ * \brief Limex NFA: acceleration runtime.
+ */ + +#include "limex_accel.h" + +#include "accel.h" +#include "limex_internal.h" +#include "limex_limits.h" +#include "limex_shuffle.h" +#include "nfa_internal.h" +#include "shufti.h" +#include "truffle.h" +#include "ue2common.h" +#include "vermicelli.h" +#include "util/arch.h" +#include "util/bitutils.h" +#include "util/simd_utils.h" + +static really_inline +size_t accelScanWrapper(const u8 *accelTable, const union AccelAux *aux, + const u8 *input, u32 idx, size_t i, size_t end) { + assert(accelTable); + assert(aux); + + DEBUG_PRINTF("shuffle returned %u -> aux %u\n", idx, accelTable[idx]); + assert(idx < (1 << NFA_MAX_ACCEL_STATES)); + if (!idx) { + return end; + } + + u8 aux_idx = accelTable[idx]; + if (!aux_idx) { + assert(aux[0].accel_type == ACCEL_NONE); + DEBUG_PRINTF("no accel, bailing\n"); + return i; + } + + aux = aux + aux_idx; + const u8 *ptr = run_accel(aux, &input[i], &input[end]); + assert(ptr >= &input[i]); + size_t j = (size_t)(ptr - input); + DEBUG_PRINTF("accel skipped %zu of %zu chars\n", (j - i), (end - i)); + DEBUG_PRINTF("returning j=%zu (i=%zu, end=%zu)\n", j, i, end); + return j; +} + +size_t doAccel32(u32 s, u32 accel, const u8 *accelTable, + const union AccelAux *aux, const u8 *input, size_t i, + size_t end) { + u32 idx = pext32(s, accel); + return accelScanWrapper(accelTable, aux, input, idx, i, end); +} + +#ifdef ARCH_64_BIT +size_t doAccel64(u64a s, u64a accel, const u8 *accelTable, + const union AccelAux *aux, const u8 *input, size_t i, + size_t end) { + u32 idx = pext64(s, accel); + return accelScanWrapper(accelTable, aux, input, idx, i, end); +} +#else +size_t doAccel64(m128 s, m128 accel, const u8 *accelTable, + const union AccelAux *aux, const u8 *input, size_t i, + size_t end) { + u32 idx = pext64(movq(s), movq(accel)); + return accelScanWrapper(accelTable, aux, input, idx, i, end); +} +#endif + +size_t doAccel128(const m128 *state, const struct LimExNFA128 *limex, + const u8 *accelTable, const union AccelAux *aux, + const u8 *input, size_t i, size_t end) { + u32 idx; + m128 s = *state; + DEBUG_PRINTF("using PSHUFB for 128-bit shuffle\n"); + m128 accelPerm = limex->accelPermute; + m128 accelComp = limex->accelCompare; + idx = packedExtract128(s, accelPerm, accelComp); + return accelScanWrapper(accelTable, aux, input, idx, i, end); +} + +size_t doAccel256(const m256 *state, const struct LimExNFA256 *limex, + const u8 *accelTable, const union AccelAux *aux, + const u8 *input, size_t i, size_t end) { + u32 idx; + m256 s = *state; + DEBUG_PRINTF("using PSHUFB for 256-bit shuffle\n"); + m256 accelPerm = limex->accelPermute; + m256 accelComp = limex->accelCompare; +#if !defined(HAVE_AVX2) + u32 idx1 = packedExtract128(s.lo, accelPerm.lo, accelComp.lo); + u32 idx2 = packedExtract128(s.hi, accelPerm.hi, accelComp.hi); + assert((idx1 & idx2) == 0); // should be no shared bits + idx = idx1 | idx2; +#else + idx = packedExtract256(s, accelPerm, accelComp); +#endif + return accelScanWrapper(accelTable, aux, input, idx, i, end); +} + +size_t doAccel384(const m384 *state, const struct LimExNFA384 *limex, + const u8 *accelTable, const union AccelAux *aux, + const u8 *input, size_t i, size_t end) { + u32 idx; + m384 s = *state; + DEBUG_PRINTF("using PSHUFB for 384-bit shuffle\n"); + m384 accelPerm = limex->accelPermute; + m384 accelComp = limex->accelCompare; + u32 idx1 = packedExtract128(s.lo, accelPerm.lo, accelComp.lo); + u32 idx2 = packedExtract128(s.mid, accelPerm.mid, accelComp.mid); + u32 idx3 = packedExtract128(s.hi, accelPerm.hi, accelComp.hi); + assert((idx1 
& idx2 & idx3) == 0); // should be no shared bits + idx = idx1 | idx2 | idx3; + return accelScanWrapper(accelTable, aux, input, idx, i, end); +} + +size_t doAccel512(const m512 *state, const struct LimExNFA512 *limex, + const u8 *accelTable, const union AccelAux *aux, + const u8 *input, size_t i, size_t end) { + u32 idx; + m512 s = *state; + DEBUG_PRINTF("using PSHUFB for 512-bit shuffle\n"); + m512 accelPerm = limex->accelPermute; + m512 accelComp = limex->accelCompare; +#if defined(HAVE_AVX512) + idx = packedExtract512(s, accelPerm, accelComp); +#elif defined(HAVE_AVX2) + u32 idx1 = packedExtract256(s.lo, accelPerm.lo, accelComp.lo); + u32 idx2 = packedExtract256(s.hi, accelPerm.hi, accelComp.hi); + assert((idx1 & idx2) == 0); // should be no shared bits + idx = idx1 | idx2; +#else + u32 idx1 = packedExtract128(s.lo.lo, accelPerm.lo.lo, accelComp.lo.lo); + u32 idx2 = packedExtract128(s.lo.hi, accelPerm.lo.hi, accelComp.lo.hi); + u32 idx3 = packedExtract128(s.hi.lo, accelPerm.hi.lo, accelComp.hi.lo); + u32 idx4 = packedExtract128(s.hi.hi, accelPerm.hi.hi, accelComp.hi.hi); + assert((idx1 & idx2 & idx3 & idx4) == 0); // should be no shared bits + idx = idx1 | idx2 | idx3 | idx4; +#endif + return accelScanWrapper(accelTable, aux, input, idx, i, end); +} diff --git a/regex/nfa/limex_accel.h b/regex/nfa/limex_accel.h new file mode 100644 index 000000000..e5c94e82a --- /dev/null +++ b/regex/nfa/limex_accel.h @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2015-2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Limex NFA: acceleration runtime. + * + * For the SIMD types (128 bits and above), we pass a pointer to the + * implementation NFA structure instead of three masks: otherwise we spend all + * our time building stack frames. 
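+ *
+ * This is why doAccel128 and the wider variants take a pointer to the
+ * LimExNFA implementation structure (and read accelPermute/accelCompare from
+ * it), while doAccel32/doAccel64 are handed the state and accel mask values
+ * directly.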
+ */ + +#ifndef LIMEX_ACCEL_H +#define LIMEX_ACCEL_H + +#include "util/simd_utils.h" // for m128 etc + +union AccelAux; +struct LimExNFA64; +struct LimExNFA128; +struct LimExNFA256; +struct LimExNFA384; +struct LimExNFA512; + +size_t doAccel32(u32 s, u32 accel, const u8 *accelTable, + const union AccelAux *aux, const u8 *input, size_t i, + size_t end); + +#ifdef ARCH_64_BIT +size_t doAccel64(u64a s, u64a accel, const u8 *accelTable, + const union AccelAux *aux, const u8 *input, size_t i, + size_t end); +#else +size_t doAccel64(m128 s, m128 accel, const u8 *accelTable, + const union AccelAux *aux, const u8 *input, size_t i, + size_t end); +#endif + +size_t doAccel128(const m128 *s, const struct LimExNFA128 *limex, + const u8 *accelTable, const union AccelAux *aux, + const u8 *input, size_t i, size_t end); + +size_t doAccel256(const m256 *s, const struct LimExNFA256 *limex, + const u8 *accelTable, const union AccelAux *aux, + const u8 *input, size_t i, size_t end); + +size_t doAccel384(const m384 *s, const struct LimExNFA384 *limex, + const u8 *accelTable, const union AccelAux *aux, + const u8 *input, size_t i, size_t end); + +size_t doAccel512(const m512 *s, const struct LimExNFA512 *limex, + const u8 *accelTable, const union AccelAux *aux, + const u8 *input, size_t i, size_t end); + +#endif diff --git a/regex/nfa/limex_common_impl.h b/regex/nfa/limex_common_impl.h new file mode 100644 index 000000000..e441945d7 --- /dev/null +++ b/regex/nfa/limex_common_impl.h @@ -0,0 +1,431 @@ +/* + * Copyright (c) 2015-2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "repeat.h" +#include "util/join.h" + +/* impl of limex functions which depend only on state size */ + +#if !defined(SIZE) || !defined(STATE_T) || !defined(LOAD_FROM_ENG) \ + || !defined(INLINE_ATTR) +# error Must define SIZE, STATE_T, LOAD_FROM_ENG and INLINE_ATTR in includer. 
+#endif + +#define IMPL_NFA_T JOIN(struct LimExNFA, SIZE) + +#define TESTEOD_FN JOIN(moNfaTestEod, SIZE) +#define LIMEX_INACCEPT_FN JOIN(limexInAccept, SIZE) +#define LIMEX_INANYACCEPT_FN JOIN(limexInAnyAccept, SIZE) +#define EXPIRE_ESTATE_FN JOIN(limexExpireExtendedState, SIZE) +#define REPORTCURRENT_FN JOIN(moNfaReportCurrent, SIZE) +#define INITIAL_FN JOIN(moNfaInitial, SIZE) +#define TOP_FN JOIN(moNfaTop, SIZE) +#define TOPN_FN JOIN(moNfaTopN, SIZE) +#define PROCESS_ACCEPTS_IMPL_FN JOIN(moProcessAcceptsImpl, SIZE) +#define PROCESS_ACCEPTS_FN JOIN(moProcessAccepts, SIZE) +#define PROCESS_ACCEPTS_NOSQUASH_FN JOIN(moProcessAcceptsNoSquash, SIZE) +#define CONTEXT_T JOIN(NFAContext, SIZE) +#define ONES_STATE JOIN(ones_, STATE_T) +#define AND_STATE JOIN(and_, STATE_T) +#define OR_STATE JOIN(or_, STATE_T) +#define ANDNOT_STATE JOIN(andnot_, STATE_T) +#define CLEARBIT_STATE JOIN(clearbit_, STATE_T) +#define TESTBIT_STATE JOIN(testbit_, STATE_T) +#define ISNONZERO_STATE JOIN(isNonZero_, STATE_T) +#define ISZERO_STATE JOIN(isZero_, STATE_T) +#define SQUASH_UNTUG_BR_FN JOIN(lazyTug, SIZE) +#define GET_NFA_REPEAT_INFO_FN JOIN(getNfaRepeatInfo, SIZE) + +#if defined(ARCH_64_BIT) && (SIZE >= 64) +#define CHUNK_T u64a +#define FIND_AND_CLEAR_FN findAndClearLSB_64 +#define POPCOUNT_FN popcount64 +#define RANK_IN_MASK_FN rank_in_mask64 +#else +#define CHUNK_T u32 +#define FIND_AND_CLEAR_FN findAndClearLSB_32 +#define POPCOUNT_FN popcount32 +#define RANK_IN_MASK_FN rank_in_mask32 +#endif + +#define NUM_STATE_CHUNKS (sizeof(STATE_T) / sizeof(CHUNK_T)) + +static really_inline +void SQUASH_UNTUG_BR_FN(const IMPL_NFA_T *limex, + const union RepeatControl *repeat_ctrl, + const char *repeat_state, u64a offset, + STATE_T *accstate) { + // switch off cyclic tug-accepts which aren't tuggable right now. + + /* TODO: might be nice to work which br to examine based on accstate rather + * than iterating overall br */ + + if (!limex->repeatCount) { + return; + } + + assert(repeat_ctrl); + assert(repeat_state); + + for (u32 i = 0; i < limex->repeatCount; i++) { + const struct NFARepeatInfo *info = GET_NFA_REPEAT_INFO_FN(limex, i); + + u32 cyclicState = info->cyclicState; + if (!TESTBIT_STATE(*accstate, cyclicState)) { + continue; + } + + DEBUG_PRINTF("repeat %u (cyclic state %u) is active\n", i, cyclicState); + DEBUG_PRINTF("checking if offset %llu would match\n", offset); + + const union RepeatControl *ctrl = repeat_ctrl + i; + const char *state = repeat_state + info->stateOffset; + const struct RepeatInfo *repeat = getRepeatInfo(info); + if (repeatHasMatch(repeat, ctrl, state, offset) != REPEAT_MATCH) { + DEBUG_PRINTF("not ready to accept yet\n"); + CLEARBIT_STATE(accstate, cyclicState); + } + } +} + +static really_inline +char PROCESS_ACCEPTS_IMPL_FN(const IMPL_NFA_T *limex, const STATE_T *s, + STATE_T *squash, const STATE_T *acceptMask, + const struct NFAAccept *acceptTable, u64a offset, + NfaCallback callback, void *context) { + assert(s); + assert(limex); + assert(callback); + + const STATE_T accept_mask = *acceptMask; + STATE_T accepts = AND_STATE(*s, accept_mask); + + // Caller must ensure that we have at least one accept state on. + assert(ISNONZERO_STATE(accepts)); + + CHUNK_T chunks[NUM_STATE_CHUNKS]; + memcpy(chunks, &accepts, sizeof(accepts)); + + CHUNK_T mask_chunks[NUM_STATE_CHUNKS]; + memcpy(mask_chunks, &accept_mask, sizeof(accept_mask)); + + u32 base_index = 0; // Cumulative sum of mask popcount up to current chunk. 
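+    // Walk the accept set chunk by chunk: each set bit is mapped to its
+    // NFAAccept entry by its rank within the accept mask chunk plus the
+    // popcount of the preceding chunks (base_index).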
+    for (u32 i = 0; i < NUM_STATE_CHUNKS; i++) {
+        CHUNK_T chunk = chunks[i];
+        while (chunk != 0) {
+            u32 bit = FIND_AND_CLEAR_FN(&chunk);
+            u32 local_idx = RANK_IN_MASK_FN(mask_chunks[i], bit);
+            u32 idx = local_idx + base_index;
+            const struct NFAAccept *a = &acceptTable[idx];
+            DEBUG_PRINTF("state %u: firing report list=%u, offset=%llu\n",
+                         bit + i * (u32)sizeof(chunk) * 8, a->reports, offset);
+            int rv = limexRunAccept((const char *)limex, a, callback, context,
+                                    offset);
+            if (unlikely(rv == MO_HALT_MATCHING)) {
+                return 1;
+            }
+            if (squash != NULL && a->squash != MO_INVALID_IDX) {
+                DEBUG_PRINTF("applying squash mask at offset %u\n", a->squash);
+                const ENG_STATE_T *sq =
+                    (const ENG_STATE_T *)((const char *)limex + a->squash);
+                *squash = AND_STATE(*squash, LOAD_FROM_ENG(sq));
+            }
+        }
+        base_index += POPCOUNT_FN(mask_chunks[i]);
+    }
+
+    return 0;
+}
+
+static never_inline
+char PROCESS_ACCEPTS_FN(const IMPL_NFA_T *limex, STATE_T *s,
+                        const STATE_T *acceptMask,
+                        const struct NFAAccept *acceptTable, u64a offset,
+                        NfaCallback callback, void *context) {
+    // We have squash masks we might have to apply after firing reports.
+    STATE_T squash = ONES_STATE;
+    char rv = PROCESS_ACCEPTS_IMPL_FN(limex, s, &squash, acceptMask,
+                                      acceptTable, offset, callback, context);
+
+    *s = AND_STATE(*s, squash);
+    return rv;
+}
+
+static never_inline
+char PROCESS_ACCEPTS_NOSQUASH_FN(const IMPL_NFA_T *limex, const STATE_T *s,
+                                 const STATE_T *acceptMask,
+                                 const struct NFAAccept *acceptTable,
+                                 u64a offset, NfaCallback callback,
+                                 void *context) {
+    STATE_T *squash = NULL;
+    return PROCESS_ACCEPTS_IMPL_FN(limex, s, squash, acceptMask, acceptTable,
+                                   offset, callback, context);
+}
+
+// Run EOD accepts. Note that repeat_ctrl and repeat_state may be NULL if this
+// LimEx contains no repeat structures.
+static really_inline
+char TESTEOD_FN(const IMPL_NFA_T *limex, const STATE_T *s,
+                const union RepeatControl *repeat_ctrl,
+                const char *repeat_state, u64a offset,
+                NfaCallback callback, void *context) {
+    assert(limex && s);
+
+    // There may not be any EOD accepts in this NFA.
+    if (!limex->acceptEodCount) {
+        return MO_CONTINUE_MATCHING;
+    }
+
+    const STATE_T acceptEodMask = LOAD_FROM_ENG(&limex->acceptAtEOD);
+    STATE_T foundAccepts = AND_STATE(*s, acceptEodMask);
+
+    SQUASH_UNTUG_BR_FN(limex, repeat_ctrl, repeat_state,
+                       offset + 1 /* EOD 'symbol' */, &foundAccepts);
+
+    if (unlikely(ISNONZERO_STATE(foundAccepts))) {
+        const struct NFAAccept *acceptEodTable = getAcceptEodTable(limex);
+        if (PROCESS_ACCEPTS_NOSQUASH_FN(limex, &foundAccepts, &acceptEodMask,
+                                        acceptEodTable, offset, callback,
+                                        context)) {
+            return MO_HALT_MATCHING;
+        }
+    }
+
+    return MO_CONTINUE_MATCHING;
+}
+
+// Run accepts corresponding to current state.
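+// The queue must be positioned on its MQE_START item; no squash masks are
+// applied on this path.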
+static really_inline +char REPORTCURRENT_FN(const IMPL_NFA_T *limex, const struct mq *q) { + assert(limex && q); + assert(q->state); + assert(q_cur_type(q) == MQE_START); + + STATE_T s = *(STATE_T *)q->state; + STATE_T acceptMask = LOAD_FROM_ENG(&limex->accept); + STATE_T foundAccepts = AND_STATE(s, acceptMask); + + if (unlikely(ISNONZERO_STATE(foundAccepts))) { + DEBUG_PRINTF("found accepts\n"); + DEBUG_PRINTF("for nfa %p\n", limex); + const struct NFAAccept *acceptTable = getAcceptTable(limex); + u64a offset = q_cur_offset(q); + + if (PROCESS_ACCEPTS_NOSQUASH_FN(limex, &foundAccepts, &acceptMask, + acceptTable, offset, q->cb, + q->context)) { + return MO_HALT_MATCHING; + } + } + + return MO_CONTINUE_MATCHING; +} + +static really_inline +STATE_T INITIAL_FN(const IMPL_NFA_T *impl, char onlyDs) { + return LOAD_FROM_ENG(onlyDs ? &impl->initDS : &impl->init); +} + +static really_inline +STATE_T TOP_FN(const IMPL_NFA_T *impl, char onlyDs, STATE_T state) { + return OR_STATE(INITIAL_FN(impl, onlyDs), state); +} + +static really_inline +STATE_T TOPN_FN(const IMPL_NFA_T *limex, STATE_T state, u32 n) { + assert(n < limex->topCount); + const ENG_STATE_T *topsptr = + (const ENG_STATE_T *)((const char *)limex + limex->topOffset); + STATE_T top = LOAD_FROM_ENG(&topsptr[n]); + return OR_STATE(top, state); +} + +static really_inline +void EXPIRE_ESTATE_FN(const IMPL_NFA_T *limex, struct CONTEXT_T *ctx, + u64a offset) { + assert(limex); + assert(ctx); + + if (!limex->repeatCount) { + return; + } + + DEBUG_PRINTF("expire estate at offset %llu\n", offset); + + const STATE_T cyclics + = AND_STATE(ctx->s, LOAD_FROM_ENG(&limex->repeatCyclicMask)); + if (ISZERO_STATE(cyclics)) { + DEBUG_PRINTF("no cyclic states are on\n"); + return; + } + + for (u32 i = 0; i < limex->repeatCount; i++) { + const struct NFARepeatInfo *info = GET_NFA_REPEAT_INFO_FN(limex, i); + + u32 cyclicState = info->cyclicState; + if (!TESTBIT_STATE(cyclics, cyclicState)) { + continue; + } + + DEBUG_PRINTF("repeat %u (cyclic state %u) is active\n", i, + cyclicState); + + const struct RepeatInfo *repeat = getRepeatInfo(info); + if (repeat->repeatMax == REPEAT_INF) { + continue; // can't expire + } + + const union RepeatControl *repeat_ctrl = ctx->repeat_ctrl + i; + const char *repeat_state = ctx->repeat_state + info->stateOffset; + u64a last_top = repeatLastTop(repeat, repeat_ctrl, repeat_state); + assert(repeat->repeatMax < REPEAT_INF); + DEBUG_PRINTF("offset %llu, last_top %llu repeatMax %u\n", offset, + last_top, repeat->repeatMax); + u64a adj = 0; + /* if the cycle's tugs are active at repeat max, it is still alive */ + if (TESTBIT_STATE(LOAD_FROM_ENG(&limex->accept), cyclicState) || + TESTBIT_STATE(LOAD_FROM_ENG(&limex->acceptAtEOD), cyclicState)) { + DEBUG_PRINTF("lazy tug possible - may still be inspected\n"); + adj = 1; + } else { + const ENG_STATE_T *tug_mask = + (const ENG_STATE_T *)((const char *)info + info->tugMaskOffset); + if (ISNONZERO_STATE(AND_STATE(ctx->s, LOAD_FROM_ENG(tug_mask)))) { + DEBUG_PRINTF("tug possible - may still be inspected\n"); + adj = 1; + } + } + + if (offset >= last_top + repeat->repeatMax + adj) { + DEBUG_PRINTF("repeat state is stale, squashing state %u\n", + cyclicState); + CLEARBIT_STATE(&ctx->s, cyclicState); + } + } +} + +// Specialised inAccept call: LimEx NFAs with the "lazy tug" optimisation (see +// UE-1636) need to guard cyclic tug-accepts as well. 
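+// SQUASH_UNTUG_BR_FN clears accept bits for cyclic repeat states whose
+// bounded repeat has no match at this offset, so a tug-accept is only
+// acknowledged once the repeat is actually satisfied.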
+static really_inline +char LIMEX_INACCEPT_FN(const IMPL_NFA_T *limex, STATE_T state, + union RepeatControl *repeat_ctrl, char *repeat_state, + u64a offset, ReportID report) { + assert(limex); + + const STATE_T accept_mask = LOAD_FROM_ENG(&limex->accept); + STATE_T accepts = AND_STATE(state, accept_mask); + + // Are we in an accept state? + if (ISZERO_STATE(accepts)) { + DEBUG_PRINTF("no accept states are on\n"); + return 0; + } + + SQUASH_UNTUG_BR_FN(limex, repeat_ctrl, repeat_state, offset, &accepts); + + DEBUG_PRINTF("looking for report %u\n", report); + + const struct NFAAccept *acceptTable = getAcceptTable(limex); + + CHUNK_T chunks[NUM_STATE_CHUNKS]; + memcpy(chunks, &accepts, sizeof(accepts)); + + CHUNK_T mask_chunks[NUM_STATE_CHUNKS]; + memcpy(mask_chunks, &accept_mask, sizeof(accept_mask)); + + u32 base_index = 0; // Cumulative sum of mask popcount up to current chunk. + for (u32 i = 0; i < NUM_STATE_CHUNKS; i++) { + CHUNK_T chunk = chunks[i]; + while (chunk != 0) { + u32 bit = FIND_AND_CLEAR_FN(&chunk); + u32 local_idx = RANK_IN_MASK_FN(mask_chunks[i], bit); + u32 idx = local_idx + base_index; + assert(idx < limex->acceptCount); + const struct NFAAccept *a = &acceptTable[idx]; + DEBUG_PRINTF("state %u is on, report list at %u\n", + bit + i * (u32)sizeof(chunk) * 8, a->reports); + + if (limexAcceptHasReport((const char *)limex, a, report)) { + DEBUG_PRINTF("report %u is on\n", report); + return 1; + } + } + base_index += POPCOUNT_FN(mask_chunks[i]); + } + + return 0; +} + +static really_inline +char LIMEX_INANYACCEPT_FN(const IMPL_NFA_T *limex, STATE_T state, + union RepeatControl *repeat_ctrl, char *repeat_state, + u64a offset) { + assert(limex); + + const STATE_T acceptMask = LOAD_FROM_ENG(&limex->accept); + STATE_T accstate = AND_STATE(state, acceptMask); + + // Are we in an accept state? + if (ISZERO_STATE(accstate)) { + DEBUG_PRINTF("no accept states are on\n"); + return 0; + } + + SQUASH_UNTUG_BR_FN(limex, repeat_ctrl, repeat_state, offset, &accstate); + + return ISNONZERO_STATE(accstate); +} + +#undef TESTEOD_FN +#undef REPORTCURRENT_FN +#undef EXPIRE_ESTATE_FN +#undef LIMEX_INACCEPT_FN +#undef LIMEX_INANYACCEPT_FN +#undef INITIAL_FN +#undef TOP_FN +#undef TOPN_FN +#undef CONTEXT_T +#undef IMPL_NFA_T +#undef ONES_STATE +#undef AND_STATE +#undef OR_STATE +#undef ANDNOT_STATE +#undef CLEARBIT_STATE +#undef TESTBIT_STATE +#undef ISNONZERO_STATE +#undef ISZERO_STATE +#undef PROCESS_ACCEPTS_IMPL_FN +#undef PROCESS_ACCEPTS_FN +#undef PROCESS_ACCEPTS_NOSQUASH_FN +#undef SQUASH_UNTUG_BR_FN +#undef GET_NFA_REPEAT_INFO_FN + +#undef CHUNK_T +#undef FIND_AND_CLEAR_FN +#undef POPCOUNT_FN +#undef RANK_IN_MASK_FN +#undef NUM_STATE_CHUNKS diff --git a/regex/nfa/limex_context.h b/regex/nfa/limex_context.h new file mode 100644 index 000000000..60d208793 --- /dev/null +++ b/regex/nfa/limex_context.h @@ -0,0 +1,91 @@ +/* + * Copyright (c) 2015-2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Runtime context structures (NFAContext128 and friends) for the NFA. + */ + +#ifndef LIMEX_CONTEXT_H +#define LIMEX_CONTEXT_H + +#include "ue2common.h" +#include "callback.h" +#include "util/simd_utils.h" // for m128 etc + +// Runtime context structures. + +/* Note: The size of the context structures may vary from platform to platform + * (notably, for the Limex64 structure). As a result, information based on the + * size and other detail of these structures should not be written into the + * bytecode -- really, the details of the structure should not be accessed by + * the ue2 compile side at all. + */ +#ifdef __cplusplus +#error ue2 runtime only file +#endif + +/* cached_estate/esucc etc... + * + * If the exception state matches the cached_estate we will apply + * the or in the cached_esucc to the successor states rather than processing + * the exceptions. + * + * If the current exception state is a superset of the cached_estate, the + * cache is NOT used at all. + * + * The cache is updated when we see a different cacheable estate. + */ + +#define GEN_CONTEXT_STRUCT(nsize, ntype) \ +struct ALIGN_CL_DIRECTIVE NFAContext##nsize { \ + ntype s; /**< state bitvector (on entry/exit) */ \ + ntype local_succ; /**< used by exception handling for large models */ \ + ntype cached_estate; /* inited to 0 */ \ + ntype cached_esucc; \ + char cached_br; /**< cached_estate contains a br state */ \ + const ReportID *cached_reports; \ + union RepeatControl *repeat_ctrl; \ + char *repeat_state; \ + NfaCallback callback; \ + void *context; \ +}; + +GEN_CONTEXT_STRUCT(32, u32) +#ifdef ARCH_64_BIT +GEN_CONTEXT_STRUCT(64, u64a) +#else +GEN_CONTEXT_STRUCT(64, m128) +#endif +GEN_CONTEXT_STRUCT(128, m128) +GEN_CONTEXT_STRUCT(256, m256) +GEN_CONTEXT_STRUCT(384, m384) +GEN_CONTEXT_STRUCT(512, m512) + +#undef GEN_CONTEXT_STRUCT + +#endif diff --git a/regex/nfa/limex_exceptional.h b/regex/nfa/limex_exceptional.h new file mode 100644 index 000000000..6c7335f1b --- /dev/null +++ b/regex/nfa/limex_exceptional.h @@ -0,0 +1,401 @@ +/* + * Copyright (c) 2015-2020, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief LimEx NFA: runtime exception processing code. + * + * X-macro generic impl, included into the various LimEx model implementations. + */ + +#if !defined(SIZE) || !defined(STATE_T) || !defined(LOAD_FROM_ENG) +# error Must define SIZE, STATE_T, LOAD_FROM_ENG in includer. +#endif + +#include "config.h" +#include "limex_ring.h" +#include "util/join.h" +#include "util/uniform_ops.h" + +#define PE_FN JOIN(processExceptional, SIZE) +#define RUN_EXCEPTION_FN JOIN(runException, SIZE) +#define ZERO_STATE JOIN(zero_, STATE_T) +#define AND_STATE JOIN(and_, STATE_T) +#define EQ_STATE(a, b) (!JOIN(noteq_, STATE_T)((a), (b))) +#define OR_STATE JOIN(or_, STATE_T) +#define EXPAND_STATE JOIN(expand_, STATE_T) +#define SHUFFLE_BYTE_STATE JOIN(shuffle_byte_, STATE_T) +#define TESTBIT_STATE JOIN(testbit_, STATE_T) +#define EXCEPTION_T JOIN(struct NFAException, SIZE) +#define CONTEXT_T JOIN(NFAContext, SIZE) +#define IMPL_NFA_T JOIN(LimExNFA, SIZE) +#define GET_NFA_REPEAT_INFO_FN JOIN(getNfaRepeatInfo, SIZE) + +#ifdef ESTATE_ON_STACK +#define ESTATE_ARG STATE_T estate +#else +#define ESTATE_ARG const STATE_T *estatep +#define estate (*estatep) +#endif + +#ifdef STATE_ON_STACK +#define STATE_ARG_NAME s +#define STATE_ARG STATE_T STATE_ARG_NAME +#define STATE_ARG_P &s +#else +#define STATE_ARG_NAME sp +#define STATE_ARG const STATE_T *STATE_ARG_NAME +#define STATE_ARG_P sp +#endif + +#ifndef STATE_ON_STACK +#define BIG_MODEL +#endif + +#ifdef ARCH_64_BIT +#define CHUNK_T u64a +#define FIND_AND_CLEAR_FN findAndClearLSB_64 +#define POPCOUNT_FN popcount64 +#define RANK_IN_MASK_FN rank_in_mask64 +#else +#define CHUNK_T u32 +#define FIND_AND_CLEAR_FN findAndClearLSB_32 +#define POPCOUNT_FN popcount32 +#define RANK_IN_MASK_FN rank_in_mask32 +#endif + +/** \brief Process a single exception. Returns 1 if exception handling should + * continue, 0 if an accept callback has instructed us to halt. 
*/ +static really_inline +int RUN_EXCEPTION_FN(const EXCEPTION_T *e, STATE_ARG, + STATE_T *succ, +#ifndef BIG_MODEL + STATE_T *local_succ, +#endif + const struct IMPL_NFA_T *limex, + u64a offset, + struct CONTEXT_T *ctx, + struct proto_cache *new_cache, + enum CacheResult *cacheable, + char in_rev, + const char flags) { + assert(e); + +#ifdef DEBUG_EXCEPTIONS + printf("EXCEPTION e=%p reports=%u trigger=", e, e->reports); + if (e->trigger == LIMEX_TRIGGER_NONE) { + printf("none"); + } else if (e->trigger == LIMEX_TRIGGER_POS) { + printf("pos"); + } else if (e->trigger == LIMEX_TRIGGER_TUG) { + printf("tug"); + } else { + printf("unknown!"); + } + printf("\n"); +#endif + + // Trigger exceptions, used in bounded repeats. + assert(!in_rev || e->trigger == LIMEX_TRIGGER_NONE); + if (!in_rev && e->trigger != LIMEX_TRIGGER_NONE) { + assert(e->repeatOffset != MO_INVALID_IDX); + const struct NFARepeatInfo *info = + (const struct NFARepeatInfo *)((const char *)limex + + e->repeatOffset); + const struct RepeatInfo *repeat = getRepeatInfo(info); + assert(ctx->repeat_ctrl && ctx->repeat_state); + union RepeatControl *repeat_ctrl = ctx->repeat_ctrl + info->ctrlIndex; + char *repeat_state = ctx->repeat_state + info->stateOffset; + + if (e->trigger == LIMEX_TRIGGER_POS) { + char cyclic_on = TESTBIT_STATE(*STATE_ARG_P, info->cyclicState); + processPosTrigger(repeat, repeat_ctrl, repeat_state, offset, + cyclic_on); + *cacheable = DO_NOT_CACHE_RESULT_AND_FLUSH_BR_ENTRIES; + } else { + assert(e->trigger == LIMEX_TRIGGER_TUG); + enum TriggerResult rv = + processTugTrigger(repeat, repeat_ctrl, repeat_state, offset); + if (rv == TRIGGER_FAIL) { + *cacheable = DO_NOT_CACHE_RESULT_AND_FLUSH_BR_ENTRIES; + DEBUG_PRINTF("tug found no valid matches in repeat state\n"); + return 1; // continue + } else if (rv == TRIGGER_STALE) { + *cacheable = DO_NOT_CACHE_RESULT_AND_FLUSH_BR_ENTRIES; + DEBUG_PRINTF("stale history, squashing cyclic state\n"); + assert(e->hasSquash == LIMEX_SQUASH_TUG); + *succ = AND_STATE(*succ, LOAD_FROM_ENG(&e->squash)); + return 1; // continue + } else if (rv == TRIGGER_SUCCESS_CACHE) { + new_cache->br = 1; + } else { + assert(rv == TRIGGER_SUCCESS); + *cacheable = DO_NOT_CACHE_RESULT_AND_FLUSH_BR_ENTRIES; + } + } + } + + // Some exceptions fire accepts. + if (e->reports != MO_INVALID_IDX) { + if (flags & CALLBACK_OUTPUT) { + const ReportID *reports = + (const ReportID *)((const char *)limex + e->reports); + if (unlikely(limexRunReports(reports, ctx->callback, + ctx->context, offset) + == MO_HALT_MATCHING)) { + DEBUG_PRINTF("callback instructed us to stop\n"); + return 0; // halt + } + if (*cacheable == CACHE_RESULT) { + if (!new_cache->reports || new_cache->reports == reports) { + new_cache->reports = reports; + } else { + *cacheable = DO_NOT_CACHE_RESULT; + } + } + } else { + if ((flags & FIRST_BYTE) && *cacheable == CACHE_RESULT) { + *cacheable = DO_NOT_CACHE_RESULT; + } /* otherwise we can cache as we never care about accepts */ + } + } + + // Most exceptions have a set of successors to switch on. `local_succ' is + // ORed into `succ' at the end of the caller's loop. +#ifndef BIG_MODEL + *local_succ = OR_STATE(*local_succ, LOAD_FROM_ENG(&e->successors)); +#else + ctx->local_succ = OR_STATE(ctx->local_succ, LOAD_FROM_ENG(&e->successors)); +#endif + + // Some exceptions squash states behind them. Note that we squash states in + // 'succ', not local_succ. 
+ if (e->hasSquash == LIMEX_SQUASH_CYCLIC + || e->hasSquash == LIMEX_SQUASH_REPORT) { + *succ = AND_STATE(*succ, LOAD_FROM_ENG(&e->squash)); + if (*cacheable == CACHE_RESULT) { + *cacheable = DO_NOT_CACHE_RESULT; + } + } + + return 1; // continue +} + +#ifndef RUN_EXCEPTION_FN_ONLY + +/** \brief Process all of the exceptions associated with the states in the \a + * estate. */ +static really_inline +int PE_FN(STATE_ARG, ESTATE_ARG, UNUSED u32 diffmask, STATE_T *succ, + const struct IMPL_NFA_T *limex, const EXCEPTION_T *exceptions, + u64a offset, struct CONTEXT_T *ctx, char in_rev, char flags) { + assert(diffmask > 0); // guaranteed by caller macro + + if (EQ_STATE(estate, ctx->cached_estate)) { + DEBUG_PRINTF("using cached succ from previous state\n"); + *succ = OR_STATE(*succ, ctx->cached_esucc); + if (ctx->cached_reports && (flags & CALLBACK_OUTPUT)) { + DEBUG_PRINTF("firing cached reports from previous state\n"); + if (unlikely(limexRunReports(ctx->cached_reports, ctx->callback, + ctx->context, offset) + == MO_HALT_MATCHING)) { + return PE_RV_HALT; // halt; + } + } + return 0; + } + +#ifndef BIG_MODEL + STATE_T local_succ = ZERO_STATE; +#else + ctx->local_succ = ZERO_STATE; +#endif + + struct proto_cache new_cache = {0, NULL}; + enum CacheResult cacheable = CACHE_RESULT; + +#if defined(HAVE_AVX512VBMI) && SIZE > 64 + if (likely(limex->flags & LIMEX_FLAG_EXTRACT_EXP)) { + m512 emask = EXPAND_STATE(*STATE_ARG_P); + emask = SHUFFLE_BYTE_STATE(load_m512(&limex->exceptionShufMask), emask); + emask = and512(emask, load_m512(&limex->exceptionAndMask)); + u64a word = eq512mask(emask, load_m512(&limex->exceptionBitMask)); + + do { + u32 bit = FIND_AND_CLEAR_FN(&word); + const EXCEPTION_T *e = &exceptions[bit]; + + if (!RUN_EXCEPTION_FN(e, STATE_ARG_NAME, succ, +#ifndef BIG_MODEL + &local_succ, +#endif + limex, offset, ctx, &new_cache, &cacheable, + in_rev, flags)) { + return PE_RV_HALT; + } + } while (word); + } else { + // A copy of the estate as an array of GPR-sized chunks. + CHUNK_T chunks[sizeof(STATE_T) / sizeof(CHUNK_T)]; + CHUNK_T emask_chunks[sizeof(STATE_T) / sizeof(CHUNK_T)]; +#ifdef ESTATE_ON_STACK + memcpy(chunks, &estate, sizeof(STATE_T)); +#else + memcpy(chunks, estatep, sizeof(STATE_T)); +#endif + memcpy(emask_chunks, &limex->exceptionMask, sizeof(STATE_T)); + + u32 base_index[sizeof(STATE_T) / sizeof(CHUNK_T)]; + base_index[0] = 0; + for (s32 i = 0; i < (s32)ARRAY_LENGTH(base_index) - 1; i++) { + base_index[i + 1] = base_index[i] + POPCOUNT_FN(emask_chunks[i]); + } + + do { + u32 t = findAndClearLSB_32(&diffmask); +#ifdef ARCH_64_BIT + t >>= 1; // Due to diffmask64, which leaves holes in the bitmask. +#endif + assert(t < ARRAY_LENGTH(chunks)); + CHUNK_T word = chunks[t]; + assert(word != 0); + do { + u32 bit = FIND_AND_CLEAR_FN(&word); + u32 local_index = RANK_IN_MASK_FN(emask_chunks[t], bit); + u32 idx = local_index + base_index[t]; + const EXCEPTION_T *e = &exceptions[idx]; + + if (!RUN_EXCEPTION_FN(e, STATE_ARG_NAME, succ, +#ifndef BIG_MODEL + &local_succ, +#endif + limex, offset, ctx, &new_cache, &cacheable, + in_rev, flags)) { + return PE_RV_HALT; + } + } while (word); + } while (diffmask); + } +#else + // A copy of the estate as an array of GPR-sized chunks. 
+ CHUNK_T chunks[sizeof(STATE_T) / sizeof(CHUNK_T)]; + CHUNK_T emask_chunks[sizeof(STATE_T) / sizeof(CHUNK_T)]; +#ifdef ESTATE_ON_STACK + memcpy(chunks, &estate, sizeof(STATE_T)); +#else + memcpy(chunks, estatep, sizeof(STATE_T)); +#endif + memcpy(emask_chunks, &limex->exceptionMask, sizeof(STATE_T)); + + u32 base_index[sizeof(STATE_T) / sizeof(CHUNK_T)]; + base_index[0] = 0; + for (s32 i = 0; i < (s32)ARRAY_LENGTH(base_index) - 1; i++) { + base_index[i + 1] = base_index[i] + POPCOUNT_FN(emask_chunks[i]); + } + + do { + u32 t = findAndClearLSB_32(&diffmask); +#ifdef ARCH_64_BIT + t >>= 1; // Due to diffmask64, which leaves holes in the bitmask. +#endif + assert(t < ARRAY_LENGTH(chunks)); + CHUNK_T word = chunks[t]; + assert(word != 0); + do { + u32 bit = FIND_AND_CLEAR_FN(&word); + u32 local_index = RANK_IN_MASK_FN(emask_chunks[t], bit); + u32 idx = local_index + base_index[t]; + const EXCEPTION_T *e = &exceptions[idx]; + + if (!RUN_EXCEPTION_FN(e, STATE_ARG_NAME, succ, +#ifndef BIG_MODEL + &local_succ, +#endif + limex, offset, ctx, &new_cache, &cacheable, + in_rev, flags)) { + return PE_RV_HALT; + } + } while (word); + } while (diffmask); +#endif + +#ifndef BIG_MODEL + *succ = OR_STATE(*succ, local_succ); +#else + *succ = OR_STATE(*succ, ctx->local_succ); +#endif + + if (cacheable == CACHE_RESULT) { + ctx->cached_estate = estate; +#ifndef BIG_MODEL + ctx->cached_esucc = local_succ; +#else + ctx->cached_esucc = ctx->local_succ; +#endif + ctx->cached_reports = new_cache.reports; + ctx->cached_br = new_cache.br; + } else if (cacheable == DO_NOT_CACHE_RESULT_AND_FLUSH_BR_ENTRIES) { + if (ctx->cached_br) { + ctx->cached_estate = ZERO_STATE; + } + } + + return 0; +} + +#endif + +#undef ZERO_STATE +#undef AND_STATE +#undef EQ_STATE +#undef OR_STATE +#undef EXPAND_STATE +#undef SHUFFLE_BYTE_STATE +#undef TESTBIT_STATE +#undef PE_FN +#undef RUN_EXCEPTION_FN +#undef CONTEXT_T +#undef EXCEPTION_T + +#ifdef estate +#undef estate +#endif + +#ifdef BIG_MODEL +#undef BIG_MODEL +#endif + +#undef STATE_ARG +#undef STATE_ARG_NAME +#undef STATE_ARG_P + +#undef IMPL_NFA_T + +#undef CHUNK_T +#undef FIND_AND_CLEAR_FN +#undef POPCOUNT_FN +#undef RANK_IN_MASK_FN diff --git a/regex/nfa/limex_internal.h b/regex/nfa/limex_internal.h new file mode 100644 index 000000000..23b1bd970 --- /dev/null +++ b/regex/nfa/limex_internal.h @@ -0,0 +1,203 @@ +/* + * Copyright (c) 2015-2020, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/** \file
+  This file provides the internal structures and definitions required for the
+  real NFAs (aka LimEx NFAs).
+
+  Limex NFAs now have variable length in memory. They look like this:
+
+    LimExNFA structure
+        Fixed length, e.g. LimExNFA256.
+    Reachability table
+        Variable length array of state bitvectors, mapped into by
+        NFACommonXXX.reachMap.
+    Tops
+        Variable length array of state bitvectors, used for TOP_N events.
+    Acceleration structures
+        Variable length array of AccelAux structs.
+    Accepts
+        Variable length array of NFAAccept structs.
+    EOD Accepts
+        Variable length array of NFAAccept structs.
+    Exceptions
+        Variable length array of NFAExceptionXXX structs.
+    Repeat Structure Offsets
+        Array of u32 offsets that point at each "Repeat Structure" (below)
+    Repeat Structures
+        Variable length repeat structures, addressed via
+        NFAException32::repeatOffset etc.
+
+  The state associated with the NFA is split into:
+
+  -# The "traditional" NFA state as a bitvector. This is stored in the
+     first N bytes of the state space (length given in
+     NFACommonXXX.stateSize), and may be stored shrunk to CEIL(stateSize/8)
+     or compressed. If it is stored compressed, then the
+     LIMEX_FLAG_COMPRESS_STATE flag is set in NFACommonXXX.flags.
+  -# Extended NFA state, only used in some LimEx NFAs. This consists of a
+     variable length array of LimExNFAExtendedState structures, each with
+     pointers to a packed list of mmbit structures that follows them. Only
+     present when used.
+
+  The value of NFA.stateSize gives the total state size in bytes (the sum of
+  all the above).
+
+  The number of shifts should always be greater than or equal to 1; a shift
+  count of 0 means that no appropriate NFA engine was found.
+
+*/
+
+#ifndef LIMEX_INTERNAL_H
+#define LIMEX_INTERNAL_H
+
+#include "nfa_internal.h"
+#include "repeat_internal.h"
+
+// Constants
+#define MAX_SHIFT_COUNT 8 /**< largest number of shifts used by a LimEx NFA */
+#define MAX_SHIFT_AMOUNT 16 /**< largest shift amount used by a LimEx NFA */
+
+#define LIMEX_FLAG_COMPRESS_STATE  1 /**< pack state into stream state */
+#define LIMEX_FLAG_COMPRESS_MASKED 2 /**< use reach mask-based compression */
+#define LIMEX_FLAG_CANNOT_DIE      4 /**< limex cannot have no states on */
+#define LIMEX_FLAG_EXTRACT_EXP     8 /**< use limex exception bit extraction */
+
+enum LimExTrigger {
+    LIMEX_TRIGGER_NONE = 0,
+    LIMEX_TRIGGER_POS = 1,
+    LIMEX_TRIGGER_TUG = 2
+};
+
+enum LimExSquash {
+    LIMEX_SQUASH_NONE = 0,   //!< no squash for you!
+ LIMEX_SQUASH_CYCLIC = 1, //!< squash due to cyclic state + LIMEX_SQUASH_TUG = 2, //!< squash due to tug trigger with stale estate + LIMEX_SQUASH_REPORT = 3 //!< squash when report is raised +}; + +/* uniform looking types for the macros */ +typedef u8 u_8; +typedef u16 u_16; +typedef u32 u_32; +typedef u64a u_64; +typedef m128 u_128; +typedef m256 u_256; +typedef m384 u_384; +typedef m512 u_512; + +#define CREATE_NFA_LIMEX(size) \ +struct NFAException##size { \ + u_##size squash; /**< mask of states to leave on */ \ + u_##size successors; /**< mask of states to switch on */ \ + u32 reports; /**< offset to start of reports list, or MO_INVALID_IDX */ \ + u32 repeatOffset; /**< offset to NFARepeatInfo, or MO_INVALID_IDX */ \ + u8 hasSquash; /**< from enum LimExSquash */ \ + u8 trigger; /**< from enum LimExTrigger */ \ +}; \ + \ +struct LimExNFA##size { \ + u8 reachMap[N_CHARS]; /**< map of char -> entry in reach[] */ \ + u32 reachSize; /**< number of reach masks */ \ + u32 accelCount; /**< number of entries in accel table */ \ + u32 accelTableOffset; /* rel. to start of LimExNFA */ \ + u32 accelAuxCount; /**< number of entries in aux table */ \ + u32 accelAuxOffset; /* rel. to start of LimExNFA */ \ + u32 acceptCount; \ + u32 acceptOffset; /* rel. to start of LimExNFA */ \ + u32 acceptEodCount; \ + u32 acceptEodOffset; /* rel. to start of LimExNFA */ \ + u32 exceptionCount; \ + u32 exceptionOffset; /* rel. to start of LimExNFA */ \ + u32 repeatCount; \ + u32 repeatOffset; \ + u32 squashOffset; /* rel. to start of LimExNFA; for accept squashing */ \ + u32 squashCount; \ + u32 topCount; \ + u32 topOffset; /* rel. to start of LimExNFA */ \ + u32 stateSize; /**< not including extended history */ \ + u32 flags; \ + u_##size init; \ + u_##size initDS; \ + u_##size accept; /**< mask of accept states */ \ + u_##size acceptAtEOD; /**< mask of states that accept at EOD */ \ + u_##size accel; /**< mask of accelerable states */ \ + u_##size accelPermute; /**< pshufb permute mask (not GPR) */ \ + u_##size accelCompare; /**< pshufb compare mask (not GPR) */ \ + u_##size accel_and_friends; /**< mask of accelerable states + likely + * followers */ \ + u_##size compressMask; /**< switch off before compress */ \ + u_##size exceptionMask; \ + u_##size repeatCyclicMask; /**< also includes tug states */ \ + u_##size zombieMask; /**< zombie if in any of the set states */ \ + u_##size shift[MAX_SHIFT_COUNT]; \ + u32 shiftCount; /**< number of shift masks used */ \ + u8 shiftAmount[MAX_SHIFT_COUNT]; /**< shift amount for each mask */ \ + m512 exceptionShufMask; /**< exception byte shuffle mask */ \ + m512 exceptionBitMask; /**< exception bit mask */ \ + m512 exceptionAndMask; /**< exception and mask */ \ +}; + +CREATE_NFA_LIMEX(32) +CREATE_NFA_LIMEX(64) +CREATE_NFA_LIMEX(128) +CREATE_NFA_LIMEX(256) +CREATE_NFA_LIMEX(384) +CREATE_NFA_LIMEX(512) + +/** \brief Structure describing a bounded repeat within the LimEx NFA. + * + * This struct is followed in memory by: + * + * -# a RepeatInfo structure + * -# a variable-sized lookup table for REPEAT_SPARSE_OPTIMAL_P repeats + * -# a TUG mask + */ +struct NFARepeatInfo { + u32 cyclicState; //!< index of this repeat's cyclic state + u32 ctrlIndex; //!< index of this repeat's control block + u32 packedCtrlOffset; //!< offset to packed control block in stream state + u32 stateOffset; //!< offset to repeat state in stream state + u32 stateSize; //!< total size of packed stream state for this repeat + u32 tugMaskOffset; //!< offset to tug mask (rel. 
to NFARepeatInfo) +}; + +struct NFAAccept { + u8 single_report; //!< If true, 'reports' is report id. + + /** + * \brief If single report is true, this is the report id to fire. + * Otherwise, it is the offset (relative to the start of the LimExNFA + * structure) of a list of reports, terminated with MO_INVALID_IDX. + */ + u32 reports; + + u32 squash; //!< Offset (from LimEx) into squash masks, or MO_INVALID_IDX. +}; + +#endif diff --git a/regex/nfa/limex_limits.h b/regex/nfa/limex_limits.h new file mode 100644 index 000000000..f4df54a4b --- /dev/null +++ b/regex/nfa/limex_limits.h @@ -0,0 +1,35 @@ +/* + * Copyright (c) 2015-2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef LIMEX_LIMITS_H +#define LIMEX_LIMITS_H + +#define NFA_MAX_STATES 512 /**< max states in an NFA */ +#define NFA_MAX_ACCEL_STATES 8 /**< max accel states in a NFA */ + +#endif diff --git a/regex/nfa/limex_native.c b/regex/nfa/limex_native.c new file mode 100644 index 000000000..f6f5809c3 --- /dev/null +++ b/regex/nfa/limex_native.c @@ -0,0 +1,129 @@ +/* + * Copyright (c) 2015-2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief LimEx NFA: native GPR runtime implementations. + */ + +//#define DEBUG +//#define DEBUG_INPUT +//#define DEBUG_EXCEPTIONS + +#include "limex.h" + +#include "accel.h" +#include "limex_internal.h" +#include "nfa_internal.h" +#include "ue2common.h" +#include "util/bitutils.h" + +// Common code +#define STATE_ON_STACK +#define ESTATE_ON_STACK + +#include "limex_runtime.h" + +// Other implementation code from X-Macro impl. +#define SIZE 32 +#define STATE_T u32 +#define ENG_STATE_T u32 +#define LOAD_FROM_ENG load_u32 + +#include "limex_state_impl.h" + +#define INLINE_ATTR really_inline +#include "limex_common_impl.h" + +//////////////////////////////////////////////////////////////////////////// +// LimEx NFA implementation code - general purpose registers +//////////////////////////////////////////////////////////////////////////// + +// Process exceptional states + +#define STATE_ON_STACK +#define ESTATE_ON_STACK +#define RUN_EXCEPTION_FN_ONLY +#include "limex_exceptional.h" + +static really_inline +int processExceptional32(u32 s, u32 estate, UNUSED u32 diffmask, u32 *succ, + const struct LimExNFA32 *limex, + const struct NFAException32 *exceptions, u64a offset, + struct NFAContext32 *ctx, char in_rev, char flags) { + assert(estate != 0); // guaranteed by calling macro + + if (estate == ctx->cached_estate) { + DEBUG_PRINTF("using cached succ from previous state\n"); + *succ |= ctx->cached_esucc; + if (ctx->cached_reports && (flags & CALLBACK_OUTPUT)) { + DEBUG_PRINTF("firing cached reports from previous state\n"); + if (unlikely(limexRunReports(ctx->cached_reports, ctx->callback, + ctx->context, offset) + == MO_HALT_MATCHING)) { + return PE_RV_HALT; // halt; + } + } + return 0; + } + + u32 orig_estate = estate; // for caching + u32 local_succ = 0; + struct proto_cache new_cache = {0, NULL}; + enum CacheResult cacheable = CACHE_RESULT; + + /* Note that only exception-states that consist of exceptions that _only_ + * set successors (not fire accepts or squash states) are cacheable. */ + + do { + u32 bit = findAndClearLSB_32(&estate); + u32 idx = rank_in_mask32(limex->exceptionMask, bit); + const struct NFAException32 *e = &exceptions[idx]; + if (!runException32(e, s, succ, &local_succ, limex, offset, ctx, + &new_cache, &cacheable, in_rev, flags)) { + return PE_RV_HALT; + } + } while (estate != 0); + + *succ |= local_succ; + + if (cacheable == CACHE_RESULT) { + ctx->cached_estate = orig_estate; + ctx->cached_esucc = local_succ; + ctx->cached_reports = new_cache.reports; + ctx->cached_br = new_cache.br; + } else if (cacheable == DO_NOT_CACHE_RESULT_AND_FLUSH_BR_ENTRIES) { + if (ctx->cached_br) { + ctx->cached_estate = 0U; + } + } + + return 0; +} + +// 32-bit models. 
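+// Including limex_runtime_impl.h below stamps out the public 32-bit entry
+// points (nfaExecLimEx32_Q, _Q2, _QR, _B_Reverse, _testEOD, ...) from the
+// X-macro definitions; its exception path picks up the hand-written
+// processExceptional32 above via JOIN(processExceptional, SIZE).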
+#include "limex_runtime_impl.h" diff --git a/regex/nfa/limex_ring.h b/regex/nfa/limex_ring.h new file mode 100644 index 000000000..522cfa12b --- /dev/null +++ b/regex/nfa/limex_ring.h @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2015, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Bounded Repeat implementation for the LimEx NFA. + */ + +#ifndef LIMEX_RING_H +#define LIMEX_RING_H + +#include "ue2common.h" +#include "repeat.h" + +#ifdef __cplusplus +extern "C" +{ +#endif + +/** \brief Return values from \ref processTugTrigger, used to provide feedback + * about a bounded repeat to the caller. + * + * TRIGGER_FAIL does not get cached as we prefer to use TRIGGER_STALE which + * allows the exception to squash the cyclic state as well. */ +enum TriggerResult { + TRIGGER_FAIL, /**< no valid matches, but history still valid */ + TRIGGER_SUCCESS, /**< valid match found */ + TRIGGER_STALE, /**< no valid matches and history is invalid (stale) */ + TRIGGER_SUCCESS_CACHE /**< valid match found; can cache as the repeat has no + upper bound. */ +}; + +/** \brief Handle a TUG trigger: given an \p offset, returns whether a repeat + * matches or not. */ +static really_inline +enum TriggerResult processTugTrigger(const struct RepeatInfo *info, + const union RepeatControl *ctrl, + const char *state, u64a offset) { + DEBUG_PRINTF("tug trigger, %s history, repeat={%u,%u}, offset=%llu, " + "ctrl=%p, state=%p\n", + repeatTypeName(info->type), info->repeatMin, info->repeatMax, + offset, ctrl, state); + + assert(ISALIGNED(ctrl)); + + enum RepeatMatch rv = repeatHasMatch(info, ctrl, state, offset); + switch (rv) { + case REPEAT_NOMATCH: + return TRIGGER_FAIL; + case REPEAT_STALE: + return TRIGGER_STALE; + case REPEAT_MATCH: + if (info->repeatMax == REPEAT_INF) { + // {N,} repeats can be cached. + return TRIGGER_SUCCESS_CACHE; + } else { + return TRIGGER_SUCCESS; + } + } + + assert(0); // unreachable + return TRIGGER_FAIL; +} + +/** \brief Handle a POS trigger: stores a top in the repeat. 
*/
+static really_inline
+void processPosTrigger(const struct RepeatInfo *info, union RepeatControl *ctrl,
+                       char *state, u64a offset, char is_alive) {
+    DEBUG_PRINTF("pos trigger, %s history, repeat={%u,%u}, offset=%llu, "
+                 "is_alive=%d\n", repeatTypeName(info->type),
+                 info->repeatMin, info->repeatMax, offset, is_alive);
+
+    assert(ISALIGNED(ctrl));
+
+    repeatStore(info, ctrl, state, offset, is_alive);
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/regex/nfa/limex_runtime.h b/regex/nfa/limex_runtime.h
new file mode 100644
index 000000000..6109d382d
--- /dev/null
+++ b/regex/nfa/limex_runtime.h
@@ -0,0 +1,201 @@
+/*
+ * Copyright (c) 2015-2016, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/** \file
+  \brief Limex Execution Engine Or:
+         How I Learned To Stop Worrying And Love The Preprocessor
+
+  This file includes utility functions which do not depend on the size of the
+  state or shift masks directly.
+*/
+
+#ifndef LIMEX_RUNTIME_H
+#define LIMEX_RUNTIME_H
+
+#include "limex_accel.h"
+#include "limex_context.h"
+#include "limex_internal.h"
+#include "nfa_api_util.h"
+#include "nfa_internal.h"
+#include "util/uniform_ops.h"
+
+////////////////////////////////////////////////////////////////////////////
+// LimEx NFA implementation code - common macros
+////////////////////////////////////////////////////////////////////////////
+
+#ifdef DEBUG_INPUT
+#include <ctype.h>
+#define DUMP_INPUT(index) DEBUG_PRINTF("input %p i=%zu: %02hhx (%c)\n", \
+                                       &input[index], index, input[index], \
+                                       isprint(input[index]) ?
input[index] : ' ') +#else +#define DUMP_INPUT(index) do { } while(0) +#endif + +#define NO_OUTPUT 0 +#define CALLBACK_OUTPUT 1 +#define FIRST_BYTE 16 + +enum CacheResult { + DO_NOT_CACHE_RESULT, + CACHE_RESULT, + DO_NOT_CACHE_RESULT_AND_FLUSH_BR_ENTRIES +}; + +struct proto_cache { + char br; + const ReportID *reports; +}; + +#define PE_RV_HALT 1 + +#ifdef STATE_ON_STACK +#define pass_state s +#else +#define pass_state &s +#endif + +#ifdef ESTATE_ON_STACK +#define pass_estate estate +#else +#define pass_estate &estate +#endif + +static really_inline +int limexRunReports(const ReportID *reports, NfaCallback callback, + void *context, u64a offset) { + assert(reports); + assert(callback); + + for (; *reports != MO_INVALID_IDX; ++reports) { + DEBUG_PRINTF("firing report for id %u at offset %llu\n", + *reports, offset); + int rv = callback(0, offset, *reports, context); + if (rv == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } + return MO_CONTINUE_MATCHING; // continue +} + +static really_inline +int limexRunAccept(const char *limex_base, const struct NFAAccept *accept, + NfaCallback callback, void *context, u64a offset) { + if (accept->single_report) { + const ReportID report = accept->reports; + DEBUG_PRINTF("firing single report for id %u at offset %llu\n", report, + offset); + return callback(0, offset, report, context); + } + const ReportID *reports = (const ReportID *)(limex_base + accept->reports); + return limexRunReports(reports, callback, context, offset); +} + +static really_inline +int limexAcceptHasReport(const char *limex_base, const struct NFAAccept *accept, + ReportID report) { + if (accept->single_report) { + return accept->reports == report; + } + + const ReportID *reports = (const ReportID *)(limex_base + accept->reports); + assert(*reports != MO_INVALID_IDX); + do { + if (*reports == report) { + return 1; + } + reports++; + } while (*reports != MO_INVALID_IDX); + + return 0; +} + +/** \brief Return a (correctly typed) pointer to the exception table. */ +#define getExceptionTable(exc_type, lim) \ + ((const exc_type *)((const char *)(lim) + (lim)->exceptionOffset)) + +/** \brief Return a pointer to the ordinary accepts table. */ +#define getAcceptTable(lim) \ + ((const struct NFAAccept *)((const char *)(lim) + (lim)->acceptOffset)) + +/** \brief Return a pointer to the EOD accepts table. 
*/ +#define getAcceptEodTable(lim) \ + ((const struct NFAAccept *)((const char *)(lim) + (lim)->acceptEodOffset)) + +#define MAKE_GET_NFA_REPEAT_INFO(size) \ + static really_inline const struct NFARepeatInfo *getNfaRepeatInfo##size( \ + const struct LimExNFA##size *limex, unsigned num) { \ + assert(num < limex->repeatCount); \ + \ + const char *base = (const char *)limex; \ + const u32 *repeatOffset = (const u32 *)(base + limex->repeatOffset); \ + assert(ISALIGNED(repeatOffset)); \ + \ + const struct NFARepeatInfo *info = \ + (const struct NFARepeatInfo *)(base + repeatOffset[num]); \ + assert(ISALIGNED(info)); \ + return info; \ + } + +MAKE_GET_NFA_REPEAT_INFO(32) +MAKE_GET_NFA_REPEAT_INFO(64) +MAKE_GET_NFA_REPEAT_INFO(128) +MAKE_GET_NFA_REPEAT_INFO(256) +MAKE_GET_NFA_REPEAT_INFO(384) +MAKE_GET_NFA_REPEAT_INFO(512) + +static really_inline +const struct RepeatInfo *getRepeatInfo(const struct NFARepeatInfo *info) { + const struct RepeatInfo *repeat = + (const struct RepeatInfo *)((const char *)info + sizeof(*info)); + assert(ISALIGNED(repeat)); + return repeat; +} + +static really_inline +union RepeatControl *getRepeatControlBase(char *state, size_t nfa_state_size) { + union RepeatControl *ctrl_base = + (union RepeatControl *)(state + + ROUNDUP_N(nfa_state_size, + alignof(union RepeatControl))); + assert(ISALIGNED(ctrl_base)); + return ctrl_base; +} + +static really_inline +const union RepeatControl *getRepeatControlBaseConst(const char *state, + size_t nfa_state_size) { + const union RepeatControl *ctrl_base = + (const union RepeatControl *)(state + + ROUNDUP_N(nfa_state_size, + alignof(union RepeatControl))); + assert(ISALIGNED(ctrl_base)); + return ctrl_base; +} + +#endif diff --git a/regex/nfa/limex_runtime_impl.h b/regex/nfa/limex_runtime_impl.h new file mode 100644 index 000000000..3b3bc5013 --- /dev/null +++ b/regex/nfa/limex_runtime_impl.h @@ -0,0 +1,1079 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */
+
+#include "util/join.h"
+#ifndef __KERNEL__
+#include <string.h>
+#else
+#include <linux/string.h>
+#endif
+
+/** \file
+ * \brief Limex Execution Engine Or:
+ *  How I Learned To Stop Worrying And Love The Preprocessor
+ *
+ * Version 2.0: now with X-Macros, so you get line numbers in your debugger.
+ */
+
+
+#if !defined(SIZE) || !defined(STATE_T) || !defined(LOAD_FROM_ENG)
+#  error Must define SIZE, STATE_T, LOAD_FROM_ENG in includer.
+#endif
+
+#define LIMEX_API_ROOT JOIN(nfaExecLimEx, SIZE)
+
+#define IMPL_NFA_T JOIN(struct LimExNFA, SIZE)
+
+#define TESTEOD_FN JOIN(moNfaTestEod, SIZE)
+#define INITIAL_FN JOIN(moNfaInitial, SIZE)
+#define TOP_FN JOIN(moNfaTop, SIZE)
+#define TOPN_FN JOIN(moNfaTopN, SIZE)
+#define REPORTCURRENT_FN JOIN(moNfaReportCurrent, SIZE)
+#define COMPRESS_FN JOIN(moNfaCompressState, SIZE)
+#define EXPAND_FN JOIN(moNfaExpandState, SIZE)
+#define COMPRESS_REPEATS_FN JOIN(LIMEX_API_ROOT, _Compress_Repeats)
+#define EXPAND_REPEATS_FN JOIN(LIMEX_API_ROOT, _Expand_Repeats)
+#define PROCESS_ACCEPTS_FN JOIN(moProcessAccepts, SIZE)
+#define PROCESS_ACCEPTS_NOSQUASH_FN JOIN(moProcessAcceptsNoSquash, SIZE)
+#define GET_NFA_REPEAT_INFO_FN JOIN(getNfaRepeatInfo, SIZE)
+#define RUN_ACCEL_FN JOIN(LIMEX_API_ROOT, _Run_Accel)
+#define RUN_EXCEPTIONS_FN JOIN(LIMEX_API_ROOT, _Run_Exceptions)
+#define REV_STREAM_FN JOIN(LIMEX_API_ROOT, _Rev_Stream)
+#define LOOP_NOACCEL_FN JOIN(LIMEX_API_ROOT, _Loop_No_Accel)
+#define STREAM_FN JOIN(LIMEX_API_ROOT, _Stream)
+#define STREAMCB_FN JOIN(LIMEX_API_ROOT, _Stream_CB)
+#define STREAMFIRST_FN JOIN(LIMEX_API_ROOT, _Stream_First)
+#define STREAMSILENT_FN JOIN(LIMEX_API_ROOT, _Stream_Silent)
+#define CONTEXT_T JOIN(NFAContext, SIZE)
+#define EXCEPTION_T JOIN(struct NFAException, SIZE)
+#define AND_STATE JOIN(and_, STATE_T)
+#define ANDNOT_STATE JOIN(andnot_, STATE_T)
+#define OR_STATE JOIN(or_, STATE_T)
+#define LSHIFT_STATE JOIN(lshift_, STATE_T)
+#define TESTBIT_STATE JOIN(testbit_, STATE_T)
+#define CLEARBIT_STATE JOIN(clearbit_, STATE_T)
+#define ZERO_STATE JOIN(zero_, STATE_T)
+#define ISNONZERO_STATE JOIN(isNonZero_, STATE_T)
+#define ISZERO_STATE JOIN(isZero_, STATE_T)
+#define NOTEQ_STATE JOIN(noteq_, STATE_T)
+
+// Pick an appropriate diffrich function for this platform.
+#ifdef ARCH_64_BIT
+#define DIFFRICH_STATE JOIN(diffrich64_, STATE_T)
+#else
+#define DIFFRICH_STATE JOIN(diffrich_, STATE_T)
+#endif
+
+#define EXPIRE_ESTATE_FN JOIN(limexExpireExtendedState, SIZE)
+#define SQUASH_UNTUG_BR_FN JOIN(lazyTug, SIZE)
+
+// Acceleration and exception masks: we load them on the fly for really big
+// models.
+#if SIZE < 256
+#define ACCEL_MASK accelMask
+#define ACCEL_AND_FRIENDS_MASK accel_and_friendsMask
+#define EXCEPTION_MASK exceptionMask
+#else
+#define ACCEL_MASK LOAD_FROM_ENG(&limex->accel)
+#define ACCEL_AND_FRIENDS_MASK LOAD_FROM_ENG(&limex->accel_and_friends)
+#define EXCEPTION_MASK LOAD_FROM_ENG(&limex->exceptionMask)
+#endif
+
+// Run exception processing, if necessary. Returns 0 if scanning should
+// continue, 1 if an accept was fired and the user instructed us to halt.
+static really_inline
+char RUN_EXCEPTIONS_FN(const IMPL_NFA_T *limex, const EXCEPTION_T *exceptions,
+                       STATE_T s, const STATE_T emask, size_t i, u64a offset,
+                       STATE_T *succ, u64a *final_loc, struct CONTEXT_T *ctx,
+                       const char flags, const char in_rev,
+                       const char first_match) {
+    STATE_T estate = AND_STATE(s, emask);
+    u32 diffmask = DIFFRICH_STATE(ZERO_STATE, estate);
+    if (likely(!diffmask)) {
+        return 0; // No exceptions to process.
+ } + + if (first_match && i) { + STATE_T acceptMask = LOAD_FROM_ENG(&limex->accept); + STATE_T foundAccepts = AND_STATE(s, acceptMask); + if (unlikely(ISNONZERO_STATE(foundAccepts))) { + DEBUG_PRINTF("first match at %zu\n", i); + DEBUG_PRINTF("for nfa %p\n", limex); + assert(final_loc); + ctx->s = s; + *final_loc = i; + return 1; // Halt matching. + } + } + + u64a callback_offset = i + offset; + char localflags = (!i && !in_rev) ? NO_OUTPUT | FIRST_BYTE : flags; + + int rv = JOIN(processExceptional, SIZE)( + pass_state, pass_estate, diffmask, succ, limex, exceptions, + callback_offset, ctx, in_rev, localflags); + if (rv == PE_RV_HALT) { + return 1; // Halt matching. + } + + return 0; +} + +static really_inline +size_t RUN_ACCEL_FN(const STATE_T s, UNUSED const STATE_T accelMask, + UNUSED const IMPL_NFA_T *limex, const u8 *accelTable, + const union AccelAux *accelAux, const u8 *input, size_t i, + size_t length) { + size_t j; +#if SIZE < 128 + // For small cases, we pass the state by value. + j = JOIN(doAccel, SIZE)(s, accelMask, accelTable, accelAux, input, i, + length); +#else + j = JOIN(doAccel, SIZE)(&s, limex, accelTable, accelAux, input, i, length); +#endif + + assert(j >= i); + assert(i <= length); + return j; +} + +// Shift macros for Limited NFAs. Defined in terms of uniform ops. +// LimExNFAxxx ptr in 'limex' and the current state in 's' +#define NFA_EXEC_LIM_SHIFT(limex_m, curr_m, shift_idx) \ + LSHIFT_STATE(AND_STATE(curr_m, LOAD_FROM_ENG(&limex_m->shift[shift_idx])), \ + limex_m->shiftAmount[shift_idx]) + +// Calculate the (limited model) successors for a number of variable shifts. +// Assumes current state in 'curr_m' and places the successors in 'succ_m'. +#define NFA_EXEC_GET_LIM_SUCC(limex_m, curr_m, succ_m) \ + do { \ + succ_m = NFA_EXEC_LIM_SHIFT(limex_m, curr_m, 0); \ + switch (limex_m->shiftCount) { \ + case 8: \ + succ_m = OR_STATE(succ_m, NFA_EXEC_LIM_SHIFT(limex_m, curr_m, 7)); \ + FALLTHROUGH; \ + case 7: \ + succ_m = OR_STATE(succ_m, NFA_EXEC_LIM_SHIFT(limex_m, curr_m, 6)); \ + FALLTHROUGH; \ + case 6: \ + succ_m = OR_STATE(succ_m, NFA_EXEC_LIM_SHIFT(limex_m, curr_m, 5)); \ + FALLTHROUGH; \ + case 5: \ + succ_m = OR_STATE(succ_m, NFA_EXEC_LIM_SHIFT(limex_m, curr_m, 4)); \ + FALLTHROUGH; \ + case 4: \ + succ_m = OR_STATE(succ_m, NFA_EXEC_LIM_SHIFT(limex_m, curr_m, 3)); \ + FALLTHROUGH; \ + case 3: \ + succ_m = OR_STATE(succ_m, NFA_EXEC_LIM_SHIFT(limex_m, curr_m, 2)); \ + FALLTHROUGH; \ + case 2: \ + succ_m = OR_STATE(succ_m, NFA_EXEC_LIM_SHIFT(limex_m, curr_m, 1)); \ + FALLTHROUGH; \ + case 1: \ + FALLTHROUGH; \ + case 0: \ + ; \ + } \ + } while (0) + +/** + * \brief LimEx NFAS inner loop without accel. + * + * Note that the "all zeroes" early death check is only performed if can_die is + * true. 
+ * + */ +static really_inline +char LOOP_NOACCEL_FN(const IMPL_NFA_T *limex, const u8 *input, size_t *loc, + size_t length, STATE_T *s_ptr, struct CONTEXT_T *ctx, + u64a offset, const char flags, u64a *final_loc, + const char first_match, const char can_die) { + const ENG_STATE_T *reach = get_reach_table(limex); +#if SIZE < 256 + const STATE_T exceptionMask = LOAD_FROM_ENG(&limex->exceptionMask); +#endif + const EXCEPTION_T *exceptions = getExceptionTable(EXCEPTION_T, limex); + STATE_T s = *s_ptr; + + size_t i = *loc; + for (; i != length; i++) { + DUMP_INPUT(i); + if (can_die && ISZERO_STATE(s)) { + DEBUG_PRINTF("no states are switched on, early exit\n"); + break; + } + + STATE_T succ; + NFA_EXEC_GET_LIM_SUCC(limex, s, succ); + + if (RUN_EXCEPTIONS_FN(limex, exceptions, s, EXCEPTION_MASK, i, offset, + &succ, final_loc, ctx, flags, 0, first_match)) { + return MO_HALT_MATCHING; + } + + u8 c = input[i]; + s = AND_STATE(succ, LOAD_FROM_ENG(&reach[limex->reachMap[c]])); + } + + *loc = i; + *s_ptr = s; + return MO_CONTINUE_MATCHING; +} + +static really_inline +char STREAM_FN(const IMPL_NFA_T *limex, const u8 *input, size_t length, + struct CONTEXT_T *ctx, u64a offset, const char flags, + u64a *final_loc, const char first_match) { + const ENG_STATE_T *reach = get_reach_table(limex); +#if SIZE < 256 + const STATE_T accelMask = LOAD_FROM_ENG(&limex->accel); + const STATE_T accel_and_friendsMask + = LOAD_FROM_ENG(&limex->accel_and_friends); + const STATE_T exceptionMask = LOAD_FROM_ENG(&limex->exceptionMask); +#endif + const u8 *accelTable = + (const u8 *)((const char *)limex + limex->accelTableOffset); + const union AccelAux *accelAux = + (const union AccelAux *)((const char *)limex + limex->accelAuxOffset); + const EXCEPTION_T *exceptions = getExceptionTable(EXCEPTION_T, limex); + STATE_T s = ctx->s; + + /* assert(ISALIGNED_16(exceptions)); */ + /* assert(ISALIGNED_16(reach)); */ + + size_t i = 0; + size_t min_accel_offset = 0; + if (!limex->accelCount || length < ACCEL_MIN_LEN) { + min_accel_offset = length; + goto without_accel; + } else { + goto with_accel; + } + +without_accel: + if (limex->flags & LIMEX_FLAG_CANNOT_DIE) { + const char can_die = 0; + if (LOOP_NOACCEL_FN(limex, input, &i, min_accel_offset, &s, ctx, offset, + flags, final_loc, first_match, + can_die) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } else { + const char can_die = 1; + if (LOOP_NOACCEL_FN(limex, input, &i, min_accel_offset, &s, ctx, offset, + flags, final_loc, first_match, + can_die) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } + +with_accel: + for (; i != length; i++) { + DUMP_INPUT(i); + if (i + 16 <= length && + ISZERO_STATE(ANDNOT_STATE(ACCEL_AND_FRIENDS_MASK, s))) { + DEBUG_PRINTF("current states are all accelerable\n"); + assert(i + 16 <= length); + size_t post_idx = + RUN_ACCEL_FN(s, ACCEL_MASK, limex, accelTable, accelAux, input, + i, length); + if (post_idx != i) { + /* squashing any friends as they may no longer be valid; + * offset back off should ensure they weren't doing anything + * important */ + s = AND_STATE(ACCEL_MASK, s); + } + + if (i && post_idx < min_accel_offset + BAD_ACCEL_DIST) { + min_accel_offset = post_idx + BIG_ACCEL_PENALTY; + } else { + min_accel_offset = post_idx + SMALL_ACCEL_PENALTY; + } + + if (min_accel_offset >= length - ACCEL_MIN_LEN) { + min_accel_offset = length; + } + + DEBUG_PRINTF("advanced %zd, next accel chance in %zd/%zd\n", + post_idx - i, min_accel_offset - post_idx, + length - post_idx); + + i = post_idx; + if (i == length) { + break; /* all 
chars eaten, break out of loop */ + } + goto without_accel; + } + + STATE_T succ; + NFA_EXEC_GET_LIM_SUCC(limex, s, succ); + + if (RUN_EXCEPTIONS_FN(limex, exceptions, s, EXCEPTION_MASK, i, offset, + &succ, final_loc, ctx, flags, 0, first_match)) { + return MO_HALT_MATCHING; + } + + u8 c = input[i]; + s = AND_STATE(succ, LOAD_FROM_ENG(&reach[limex->reachMap[c]])); + } + + ctx->s = s; + + if ((first_match || (flags & CALLBACK_OUTPUT)) && limex->acceptCount) { + STATE_T acceptMask = LOAD_FROM_ENG(&limex->accept); + const struct NFAAccept *acceptTable = getAcceptTable(limex); + STATE_T foundAccepts = AND_STATE(s, acceptMask); + if (unlikely(ISNONZERO_STATE(foundAccepts))) { + if (first_match) { + ctx->s = s; + assert(final_loc); + *final_loc = length; + return MO_HALT_MATCHING; + } else if (PROCESS_ACCEPTS_FN(limex, &ctx->s, &acceptMask, + acceptTable, offset + length, + ctx->callback, ctx->context)) { + return MO_HALT_MATCHING; + } + } + } + if (first_match) { + assert(final_loc); + *final_loc = length; + } + return MO_CONTINUE_MATCHING; +} + +static never_inline +char REV_STREAM_FN(const IMPL_NFA_T *limex, const u8 *input, size_t length, + struct CONTEXT_T *ctx, u64a offset) { + const ENG_STATE_T *reach = get_reach_table(limex); +#if SIZE < 256 + const STATE_T exceptionMask = LOAD_FROM_ENG(&limex->exceptionMask); +#endif + const EXCEPTION_T *exceptions = getExceptionTable(EXCEPTION_T, limex); + STATE_T s = ctx->s; + + /* assert(ISALIGNED_16(exceptions)); */ + /* assert(ISALIGNED_16(reach)); */ + const char flags = CALLBACK_OUTPUT; + u64a *final_loc = NULL; + + for (size_t i = length; i != 0; i--) { + DUMP_INPUT(i - 1); + if (ISZERO_STATE(s)) { + DEBUG_PRINTF("no states are switched on, early exit\n"); + ctx->s = s; + return MO_CONTINUE_MATCHING; + } + + STATE_T succ; + NFA_EXEC_GET_LIM_SUCC(limex, s, succ); + + if (RUN_EXCEPTIONS_FN(limex, exceptions, s, EXCEPTION_MASK, i, offset, + &succ, final_loc, ctx, flags, 1, 0)) { + return MO_HALT_MATCHING; + } + + u8 c = input[i - 1]; + s = AND_STATE(succ, LOAD_FROM_ENG(&reach[limex->reachMap[c]])); + } + + ctx->s = s; + + STATE_T acceptMask = LOAD_FROM_ENG(&limex->accept); + const struct NFAAccept *acceptTable = getAcceptTable(limex); + const u32 acceptCount = limex->acceptCount; + assert(flags & CALLBACK_OUTPUT); + if (acceptCount) { + STATE_T foundAccepts = AND_STATE(s, acceptMask); + if (unlikely(ISNONZERO_STATE(foundAccepts))) { + if (PROCESS_ACCEPTS_NOSQUASH_FN(limex, &ctx->s, &acceptMask, + acceptTable, offset, ctx->callback, + ctx->context)) { + return MO_HALT_MATCHING; + } + } + } + return MO_CONTINUE_MATCHING; +} + +static really_inline +void COMPRESS_REPEATS_FN(const IMPL_NFA_T *limex, void *dest, void *src, + u64a offset) { + if (!limex->repeatCount) { + return; + } + + STATE_T s = *(STATE_T *)src; + + if (ISZERO_STATE(AND_STATE(LOAD_FROM_ENG(&limex->repeatCyclicMask), s))) { + DEBUG_PRINTF("no cyclics are on\n"); + return; + } + + const union RepeatControl *ctrl = + getRepeatControlBaseConst((const char *)src, sizeof(STATE_T)); + char *state_base = (char *)dest + limex->stateSize; + + for (u32 i = 0; i < limex->repeatCount; i++) { + DEBUG_PRINTF("repeat %u\n", i); + const struct NFARepeatInfo *info = GET_NFA_REPEAT_INFO_FN(limex, i); + + const ENG_STATE_T *tug_mask = + (const ENG_STATE_T *)((const char *)info + info->tugMaskOffset); + /* repeat may still be inspected if its tug state is on */ + if (!TESTBIT_STATE(s, info->cyclicState) + && ISZERO_STATE(AND_STATE(s, LOAD_FROM_ENG(tug_mask)))) { + DEBUG_PRINTF("is dead\n"); + continue; + 
} + + const struct RepeatInfo *repeat = getRepeatInfo(info); + DEBUG_PRINTF("packing state (packedCtrlOffset=%u)\n", + info->packedCtrlOffset); + repeatPack(state_base + info->packedCtrlOffset, repeat, &ctrl[i], + offset); + } + + *(STATE_T *)src = s; +} + +char JOIN(LIMEX_API_ROOT, _queueCompressState)(const struct NFA *n, + const struct mq *q, s64a loc) { + void *dest = q->streamState; + void *src = q->state; + u8 key = queue_prev_byte(q, loc); + const IMPL_NFA_T *limex = getImplNfa(n); + COMPRESS_REPEATS_FN(limex, dest, src, q->offset + loc); + COMPRESS_FN(limex, dest, src, key); + return 0; +} + +static really_inline +void EXPAND_REPEATS_FN(const IMPL_NFA_T *limex, void *dest, const void *src, + u64a offset) { + if (!limex->repeatCount) { + return; + } + + // Note: state has already been expanded into 'dest'. + const STATE_T cyclics = + AND_STATE(*(STATE_T *)dest, LOAD_FROM_ENG(&limex->repeatCyclicMask)); + if (ISZERO_STATE(cyclics)) { + DEBUG_PRINTF("no cyclics are on\n"); + return; + } + + union RepeatControl *ctrl = + getRepeatControlBase((char *)dest, sizeof(STATE_T)); + const char *state_base = (const char *)src + limex->stateSize; + + for (u32 i = 0; i < limex->repeatCount; i++) { + DEBUG_PRINTF("repeat %u\n", i); + const struct NFARepeatInfo *info = GET_NFA_REPEAT_INFO_FN(limex, i); + const ENG_STATE_T *tug_mask = + (const ENG_STATE_T *)((const char *)info + info->tugMaskOffset); + + if (!TESTBIT_STATE(cyclics, info->cyclicState) + && ISZERO_STATE(AND_STATE(cyclics, LOAD_FROM_ENG(tug_mask)))) { + DEBUG_PRINTF("is dead\n"); + continue; + } + + DEBUG_PRINTF("unpacking state (packedCtrlOffset=%u)\n", + info->packedCtrlOffset); + const struct RepeatInfo *repeat = getRepeatInfo(info); + repeatUnpack(state_base + info->packedCtrlOffset, repeat, offset, + &ctrl[i]); + } +} + +char JOIN(LIMEX_API_ROOT, _expandState)(const struct NFA *n, void *dest, + const void *src, u64a offset, + u8 key) { + const IMPL_NFA_T *limex = getImplNfa(n); + EXPAND_FN(limex, dest, src, key); + EXPAND_REPEATS_FN(limex, dest, src, offset); + return 0; +} + +char JOIN(LIMEX_API_ROOT, _queueInitState)(const struct NFA *n, struct mq *q) { + *(STATE_T *)q->state = ZERO_STATE; + + // Zero every bounded repeat control block in state. + const IMPL_NFA_T *limex = getImplNfa(n); + union RepeatControl *ctrl = getRepeatControlBase(q->state, sizeof(STATE_T)); + for (u32 i = 0; i < limex->repeatCount; i++) { + memset(&ctrl[i], 0, sizeof(*ctrl)); + } + + return 0; +} + +char JOIN(LIMEX_API_ROOT, _initCompressedState)(const struct NFA *n, + u64a offset, void *state, + u8 key) { + const IMPL_NFA_T *limex = getImplNfa(n); + + STATE_T s = INITIAL_FN(limex, !!offset); + if (ISZERO_STATE(s)) { + DEBUG_PRINTF("state went to zero\n"); + return 0; + } + + // NFA is still active, compress its state and ship it out. + COMPRESS_FN(limex, state, &s, key); + + // Zero every packed bounded repeat control block in stream state. + char *repeat_region = (char *)state + limex->stateSize; + for (u32 i = 0; i < limex->repeatCount; i++) { + const struct NFARepeatInfo *info = GET_NFA_REPEAT_INFO_FN(limex, i); + const struct RepeatInfo *repeat = getRepeatInfo(info); + + memset(repeat_region + info->packedCtrlOffset, 0, + repeat->packedCtrlSize); + } + + return 1; +} + +// Helper for history buffer scans, which catch up the NFA state but don't emit +// matches. 
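+// (It runs the stream scan with NO_OUTPUT and a NULL final_loc, so no report
+// callbacks can fire; the assert below only checks that the scan never asks
+// to halt.)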
+static never_inline +void STREAMSILENT_FN(const IMPL_NFA_T *limex, const u8 *input, size_t length, + struct CONTEXT_T *ctx, u64a offset) { + const char first_match = 0; + + UNUSED char rv = STREAM_FN(limex, input, length, ctx, offset, NO_OUTPUT, + NULL, first_match); + assert(rv != MO_HALT_MATCHING); +} + +static never_inline +char STREAMCB_FN(const IMPL_NFA_T *limex, const u8 *input, size_t length, + struct CONTEXT_T *ctx, u64a offset) { + const char first_match = 0; + assert(ISALIGNED_CL(ctx)); + return STREAM_FN(limex, input, length, ctx, offset, CALLBACK_OUTPUT, NULL, + first_match); +} + +static never_inline +char STREAMFIRST_FN(const IMPL_NFA_T *limex, const u8 *input, size_t length, + struct CONTEXT_T *ctx, u64a offset, u64a *final_loc) { + const char first_match = 1; // Run to first match and stop, no callbacks. + return STREAM_FN(limex, input, length, ctx, offset, NO_OUTPUT, final_loc, + first_match); +} + +// Common code for handling the current event on the queue. +static really_inline +void JOIN(LIMEX_API_ROOT, _HandleEvent)(const IMPL_NFA_T *limex, + struct mq *q, struct CONTEXT_T *ctx, + u64a sp) { +#define DEFINE_CASE(ee) \ + case ee: \ + DEBUG_PRINTF(#ee "\n"); + + u32 e = q->items[q->cur].type; + switch (e) { + DEFINE_CASE(MQE_TOP) + ctx->s = TOP_FN(limex, !!sp, ctx->s); + break; + DEFINE_CASE(MQE_START) + break; + DEFINE_CASE(MQE_END) + break; + default: + assert(e >= MQE_TOP_FIRST); + assert(e < MQE_INVALID); + DEBUG_PRINTF("MQE_TOP + %d\n", ((int)e - MQE_TOP_FIRST)); + ctx->s = TOPN_FN(limex, ctx->s, e - MQE_TOP_FIRST); + } +#undef DEFINE_CASE +} + +// "Classic" queue call, used by outfixes +char JOIN(LIMEX_API_ROOT, _Q)(const struct NFA *n, struct mq *q, s64a end) { + const IMPL_NFA_T *limex = getImplNfa(n); + + if (q->report_current) { + char rv = REPORTCURRENT_FN(limex, q); + + q->report_current = 0; + + if (rv == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } + + if (q->cur == q->end) { + return 1; + } + + assert(q->cur + 1 < q->end); /* require at least two items */ + + struct CONTEXT_T ctx; + ctx.repeat_ctrl = getRepeatControlBase(q->state, sizeof(STATE_T)); + ctx.repeat_state = q->streamState + limex->stateSize; + ctx.callback = q->cb; + ctx.context = q->context; + ctx.cached_estate = ZERO_STATE; + ctx.cached_br = 0; + + assert(q->items[q->cur].location >= 0); + DEBUG_PRINTF("LOAD STATE\n"); + ctx.s = *(STATE_T *)q->state; + assert(q->items[q->cur].type == MQE_START); + + u64a offset = q->offset; + u64a sp = offset + q->items[q->cur].location; + u64a end_abs = offset + end; + q->cur++; + + while (q->cur < q->end && sp <= end_abs) { + u64a ep = offset + q->items[q->cur].location; + ep = MIN(ep, end_abs); + assert(ep >= sp); + + assert(sp >= offset); // We no longer do history buffer scans here. 
+ + if (sp >= ep) { + goto scan_done; + } + + /* do main buffer region */ + DEBUG_PRINTF("MAIN BUFFER SCAN\n"); + assert(ep - offset <= q->length); + if (STREAMCB_FN(limex, q->buffer + sp - offset, ep - sp, &ctx, sp) + == MO_HALT_MATCHING) { + *(STATE_T *)q->state = ZERO_STATE; + return 0; + } + + DEBUG_PRINTF("SCAN DONE\n"); + scan_done: + sp = ep; + + if (sp != offset + q->items[q->cur].location) { + assert(q->cur); + DEBUG_PRINTF("bail: sp = %llu end_abs == %llu offset == %llu\n", + sp, end_abs, offset); + assert(sp == end_abs); + q->cur--; + q->items[q->cur].type = MQE_START; + q->items[q->cur].location = sp - offset; + DEBUG_PRINTF("bailing q->cur %u q->end %u\n", q->cur, q->end); + *(STATE_T *)q->state = ctx.s; + return MO_ALIVE; + } + + JOIN(LIMEX_API_ROOT, _HandleEvent)(limex, q, &ctx, sp); + + q->cur++; + } + + EXPIRE_ESTATE_FN(limex, &ctx, sp); + + DEBUG_PRINTF("END\n"); + *(STATE_T *)q->state = ctx.s; + + if (q->cur != q->end) { + q->cur--; + q->items[q->cur].type = MQE_START; + q->items[q->cur].location = sp - offset; + return MO_ALIVE; + } + + return ISNONZERO_STATE(ctx.s); +} + +/* used by suffix execution in Rose */ +char JOIN(LIMEX_API_ROOT, _Q2)(const struct NFA *n, struct mq *q, s64a end) { + const IMPL_NFA_T *limex = getImplNfa(n); + + if (q->report_current) { + char rv = REPORTCURRENT_FN(limex, q); + + q->report_current = 0; + + if (rv == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } + + if (q->cur == q->end) { + return 1; + } + + assert(q->cur + 1 < q->end); /* require at least two items */ + + struct CONTEXT_T ctx; + ctx.repeat_ctrl = getRepeatControlBase(q->state, sizeof(STATE_T)); + ctx.repeat_state = q->streamState + limex->stateSize; + ctx.callback = q->cb; + ctx.context = q->context; + ctx.cached_estate = ZERO_STATE; + ctx.cached_br = 0; + + DEBUG_PRINTF("LOAD STATE\n"); + ctx.s = *(STATE_T *)q->state; + assert(q->items[q->cur].type == MQE_START); + + u64a offset = q->offset; + u64a sp = offset + q->items[q->cur].location; + u64a end_abs = offset + end; + q->cur++; + + while (q->cur < q->end && sp <= end_abs) { + u64a ep = offset + q->items[q->cur].location; + DEBUG_PRINTF("sp = %llu, ep = %llu, end_abs = %llu\n", + sp, ep, end_abs); + ep = MIN(ep, end_abs); + assert(ep >= sp); + + if (sp < offset) { + DEBUG_PRINTF("HISTORY BUFFER SCAN\n"); + assert(offset - sp <= q->hlength); + u64a local_ep = MIN(offset, ep); + u64a final_look = 0; + /* we are starting inside the history buffer */ + if (STREAMFIRST_FN(limex, q->history + q->hlength + sp - offset, + local_ep - sp, &ctx, sp, + &final_look) == MO_HALT_MATCHING) { + DEBUG_PRINTF("final_look:%llu sp:%llu end_abs:%llu " + "offset:%llu\n", final_look, sp, end_abs, offset); + assert(q->cur); + q->cur--; + q->items[q->cur].type = MQE_START; + q->items[q->cur].location = sp + final_look - offset; + *(STATE_T *)q->state = ctx.s; + return MO_MATCHES_PENDING; + } + + sp = local_ep; + } + + if (sp >= ep) { + goto scan_done; + } + + /* do main buffer region */ + u64a final_look = 0; + assert(ep - offset <= q->length); + if (STREAMFIRST_FN(limex, q->buffer + sp - offset, ep - sp, &ctx, sp, + &final_look) == MO_HALT_MATCHING) { + DEBUG_PRINTF("final_look:%llu sp:%llu end_abs:%llu offset:%llu\n", + final_look, sp, end_abs, offset); + assert(q->cur); + q->cur--; + q->items[q->cur].type = MQE_START; + q->items[q->cur].location = sp + final_look - offset; + *(STATE_T *)q->state = ctx.s; + return MO_MATCHES_PENDING; + } + + scan_done: + sp = ep; + + if (sp != offset + q->items[q->cur].location) { + assert(q->cur); + 
DEBUG_PRINTF("bail: sp = %llu end_abs == %llu offset == %llu\n", + sp, end_abs, offset); + assert(sp == end_abs); + q->cur--; + q->items[q->cur].type = MQE_START; + q->items[q->cur].location = sp - offset; + DEBUG_PRINTF("bailing q->cur %u q->end %u\n", q->cur, q->end); + *(STATE_T *)q->state = ctx.s; + return MO_ALIVE; + } + + JOIN(LIMEX_API_ROOT, _HandleEvent)(limex, q, &ctx, sp); + + q->cur++; + } + + EXPIRE_ESTATE_FN(limex, &ctx, sp); + + DEBUG_PRINTF("END\n"); + *(STATE_T *)q->state = ctx.s; + + if (q->cur != q->end) { + q->cur--; + q->items[q->cur].type = MQE_START; + q->items[q->cur].location = sp - offset; + return MO_ALIVE; + } + + return ISNONZERO_STATE(ctx.s); +} + +// Used for execution Rose prefix/infixes. +char JOIN(LIMEX_API_ROOT, _QR)(const struct NFA *n, struct mq *q, + ReportID report) { + const IMPL_NFA_T *limex = getImplNfa(n); + + if (q->cur == q->end) { + return 1; + } + + assert(q->cur + 1 < q->end); /* require at least two items */ + + struct CONTEXT_T ctx; + ctx.repeat_ctrl = getRepeatControlBase(q->state, sizeof(STATE_T)); + ctx.repeat_state = q->streamState + limex->stateSize; + ctx.callback = NULL; + ctx.context = NULL; + ctx.cached_estate = ZERO_STATE; + ctx.cached_br = 0; + + DEBUG_PRINTF("LOAD STATE\n"); + ctx.s = *(STATE_T *)q->state; + assert(q->items[q->cur].type == MQE_START); + + u64a offset = q->offset; + u64a sp = offset + q->items[q->cur].location; + q->cur++; + + while (q->cur < q->end) { + u64a ep = offset + q->items[q->cur].location; + if (n->maxWidth) { + if (ep - sp > n->maxWidth) { + sp = ep - n->maxWidth; + ctx.s = INITIAL_FN(limex, !!sp); + } + } + assert(ep >= sp); + + if (sp < offset) { + DEBUG_PRINTF("HISTORY BUFFER SCAN\n"); + assert(offset - sp <= q->hlength); + u64a local_ep = MIN(offset, ep); + /* we are starting inside the history buffer */ + STREAMSILENT_FN(limex, q->history + q->hlength + sp - offset, + local_ep - sp, &ctx, sp); + + sp = local_ep; + } + + if (sp >= ep) { + goto scan_done; + } + + /* do main buffer region */ + DEBUG_PRINTF("MAIN BUFFER SCAN\n"); + assert(ep - offset <= q->length); + STREAMSILENT_FN(limex, q->buffer + sp - offset, ep - sp, &ctx, sp); + + DEBUG_PRINTF("SCAN DONE\n"); + scan_done: + sp = ep; + + JOIN(LIMEX_API_ROOT, _HandleEvent)(limex, q, &ctx, sp); + + q->cur++; + } + + EXPIRE_ESTATE_FN(limex, &ctx, sp); + + DEBUG_PRINTF("END, nfa is %s\n", + ISNONZERO_STATE(ctx.s) ? "still alive" : "dead"); + + *(STATE_T *)q->state = ctx.s; + + if (JOIN(limexInAccept, SIZE)(limex, ctx.s, ctx.repeat_ctrl, + ctx.repeat_state, sp + 1, report)) { + return MO_MATCHES_PENDING; + } + + return ISNONZERO_STATE(ctx.s); +} + +char JOIN(LIMEX_API_ROOT, _testEOD)(const struct NFA *n, const char *state, + const char *streamState, u64a offset, + NfaCallback callback, void *context) { + assert(n && state); + + const IMPL_NFA_T *limex = getImplNfa(n); + const STATE_T *sptr = (const STATE_T *)state; + const union RepeatControl *repeat_ctrl = + getRepeatControlBaseConst(state, sizeof(STATE_T)); + const char *repeat_state = streamState + limex->stateSize; + return TESTEOD_FN(limex, sptr, repeat_ctrl, repeat_state, offset, callback, + context); +} + +char JOIN(LIMEX_API_ROOT, _reportCurrent)(const struct NFA *n, struct mq *q) { + const IMPL_NFA_T *limex = getImplNfa(n); + REPORTCURRENT_FN(limex, q); + return 1; +} + +// Block mode reverse scan. 
+char JOIN(LIMEX_API_ROOT, _B_Reverse)(const struct NFA *n, u64a offset, + const u8 *buf, size_t buflen, + const u8 *hbuf, size_t hlen, + NfaCallback cb, void *context) { + assert(buf || hbuf); + assert(buflen || hlen); + + struct CONTEXT_T ctx; + ctx.repeat_ctrl = NULL; + ctx.repeat_state = NULL; + ctx.callback = cb; + ctx.context = context; + ctx.cached_estate = ZERO_STATE; + ctx.cached_br = 0; + + const IMPL_NFA_T *limex = getImplNfa(n); + ctx.s = INITIAL_FN(limex, 0); // always anchored + + // 'buf' may be null, for example when we're scanning at EOD time. + if (buflen) { + assert(buf); + DEBUG_PRINTF("MAIN BUFFER SCAN, %zu bytes\n", buflen); + offset -= buflen; + REV_STREAM_FN(limex, buf, buflen, &ctx, offset); + } + + if (hlen) { + assert(hbuf); + DEBUG_PRINTF("HISTORY BUFFER SCAN, %zu bytes\n", hlen); + offset -= hlen; + REV_STREAM_FN(limex, hbuf, hlen, &ctx, offset); + } + + if (offset == 0 && limex->acceptEodCount && ISNONZERO_STATE(ctx.s)) { + const union RepeatControl *repeat_ctrl = NULL; + const char *repeat_state = NULL; + TESTEOD_FN(limex, &ctx.s, repeat_ctrl, repeat_state, offset, cb, + context); + } + + // NOTE: return value is unused. + return 0; +} + +char JOIN(LIMEX_API_ROOT, _inAccept)(const struct NFA *nfa, + ReportID report, struct mq *q) { + assert(nfa && q); + assert(q->state && q->streamState); + + const IMPL_NFA_T *limex = getImplNfa(nfa); + union RepeatControl *repeat_ctrl = + getRepeatControlBase(q->state, sizeof(STATE_T)); + char *repeat_state = q->streamState + limex->stateSize; + STATE_T state = *(STATE_T *)q->state; + u64a offset = q->offset + q_last_loc(q) + 1; + + return JOIN(limexInAccept, SIZE)(limex, state, repeat_ctrl, repeat_state, + offset, report); +} + +char JOIN(LIMEX_API_ROOT, _inAnyAccept)(const struct NFA *nfa, struct mq *q) { + assert(nfa && q); + assert(q->state && q->streamState); + + const IMPL_NFA_T *limex = getImplNfa(nfa); + union RepeatControl *repeat_ctrl = + getRepeatControlBase(q->state, sizeof(STATE_T)); + char *repeat_state = q->streamState + limex->stateSize; + STATE_T state = *(STATE_T *)q->state; + u64a offset = q->offset + q_last_loc(q) + 1; + + return JOIN(limexInAnyAccept, SIZE)(limex, state, repeat_ctrl, repeat_state, + offset); +} + +enum nfa_zombie_status JOIN(LIMEX_API_ROOT, _zombie_status)( + const struct NFA *nfa, + struct mq *q, + s64a loc) { + assert(nfa->flags & NFA_ZOMBIE); + const IMPL_NFA_T *limex = getImplNfa(nfa); + STATE_T state = *(STATE_T *)q->state; + STATE_T zmask = LOAD_FROM_ENG(&limex->zombieMask); + + if (limex->repeatCount) { + u64a offset = q->offset + loc + 1; + union RepeatControl *repeat_ctrl = + getRepeatControlBase(q->state, sizeof(STATE_T)); + char *repeat_state = q->streamState + limex->stateSize; + SQUASH_UNTUG_BR_FN(limex, repeat_ctrl, repeat_state, offset, &state); + } + + if (ISNONZERO_STATE(AND_STATE(state, zmask))) { + return NFA_ZOMBIE_ALWAYS_YES; + } + + return NFA_ZOMBIE_NO; +} + +#undef TESTEOD_FN +#undef INITIAL_FN +#undef TOP_FN +#undef TOPN_FN +#undef REPORTCURRENT_FN +#undef COMPRESS_FN +#undef EXPAND_FN +#undef COMPRESS_REPEATS_FN +#undef EXPAND_REPEATS_FN +#undef PROCESS_ACCEPTS_FN +#undef PROCESS_ACCEPTS_NOSQUASH_FN +#undef GET_NFA_REPEAT_INFO_FN +#undef RUN_ACCEL_FN +#undef RUN_EXCEPTIONS_FN +#undef REV_STREAM_FN +#undef LOOP_NOACCEL_FN +#undef STREAM_FN +#undef STREAMCB_FN +#undef STREAMFIRST_FN +#undef STREAMSILENT_FN +#undef CONTEXT_T +#undef EXCEPTION_T +#undef AND_STATE +#undef ANDNOT_STATE +#undef OR_STATE +#undef LSHIFT_STATE +#undef TESTBIT_STATE +#undef CLEARBIT_STATE 
+#undef ZERO_STATE +#undef ISNONZERO_STATE +#undef ISZERO_STATE +#undef NOTEQ_STATE +#undef DIFFRICH_STATE +#undef INLINE_ATTR_INT +#undef IMPL_NFA_T +#undef SQUASH_UNTUG_BR_FN +#undef ACCEL_MASK +#undef ACCEL_AND_FRIENDS_MASK +#undef EXCEPTION_MASK +#undef LIMEX_API_ROOT diff --git a/regex/nfa/limex_shuffle.h b/regex/nfa/limex_shuffle.h new file mode 100644 index 000000000..365d47296 --- /dev/null +++ b/regex/nfa/limex_shuffle.h @@ -0,0 +1,78 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Naive dynamic shuffles. + * + * These are written with the assumption that the provided masks are sparsely + * populated and never contain more than 32 on bits. Other implementations will + * be faster and actually correct if these assumptions don't hold true. 
+ */ + +#ifndef LIMEX_SHUFFLE_H +#define LIMEX_SHUFFLE_H + +#include "ue2common.h" +#include "util/arch.h" +#include "util/bitutils.h" +#include "util/simd_utils.h" + +static really_inline +u32 packedExtract128(m128 s, const m128 permute, const m128 compare) { + m128 shuffled = pshufb_m128(s, permute); + m128 compared = and128(shuffled, compare); + u16 rv = ~movemask128(eq128(compared, shuffled)); + return (u32)rv; +} + +#if defined(HAVE_AVX2) +static really_inline +u32 packedExtract256(m256 s, const m256 permute, const m256 compare) { + // vpshufb doesn't cross lanes, so this is a bit of a cheat + m256 shuffled = pshufb_m256(s, permute); + m256 compared = and256(shuffled, compare); + u32 rv = ~movemask256(eq256(compared, shuffled)); + // stitch the lane-wise results back together + return (u32)((rv >> 16) | (rv & 0xffffU)); +} +#endif // AVX2 + +#if defined(HAVE_AVX512) +static really_inline +u32 packedExtract512(m512 s, const m512 permute, const m512 compare) { + // vpshufb doesn't cross lanes, so this is a bit of a cheat + m512 shuffled = pshufb_m512(s, permute); + m512 compared = and512(shuffled, compare); + u64a rv = ~eq512mask(compared, shuffled); + // stitch the lane-wise results back together + rv = rv >> 32 | rv; + return (u32)(((rv >> 16) | rv) & 0xffffU); +} +#endif // AVX512 + +#endif // LIMEX_SHUFFLE_H diff --git a/regex/nfa/limex_simd128.c b/regex/nfa/limex_simd128.c new file mode 100644 index 000000000..c5f2b33e3 --- /dev/null +++ b/regex/nfa/limex_simd128.c @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2015-2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief LimEx NFA: 128-bit SIMD runtime implementations. 
+ */ + +//#define DEBUG_INPUT +//#define DEBUG_EXCEPTIONS + +#include "limex.h" + +#include "accel.h" +#include "limex_internal.h" +#include "nfa_internal.h" +#include "ue2common.h" +#include "util/bitutils.h" +#include "util/simd_utils.h" + +// Common code +#define STATE_ON_STACK +#define ESTATE_ON_STACK + +#include "limex_runtime.h" + +#define SIZE 128 +#define STATE_T m128 +#define ENG_STATE_T m128 +#define LOAD_FROM_ENG load_m128 + +#include "limex_exceptional.h" + +#include "limex_state_impl.h" + +#define INLINE_ATTR really_inline +#include "limex_common_impl.h" + +#include "limex_runtime_impl.h" diff --git a/regex/nfa/limex_simd256.c b/regex/nfa/limex_simd256.c new file mode 100644 index 000000000..cc2329081 --- /dev/null +++ b/regex/nfa/limex_simd256.c @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2015-2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief LimEx NFA: 256-bit SIMD runtime implementations. + */ + +//#define DEBUG_INPUT +//#define DEBUG_EXCEPTIONS + +#include "limex.h" + +#include "accel.h" +#include "limex_internal.h" +#include "nfa_internal.h" +#include "ue2common.h" +#include "util/bitutils.h" +#include "util/simd_utils.h" + +// Common code +#include "limex_runtime.h" + +#define SIZE 256 +#define STATE_T m256 +#define ENG_STATE_T m256 +#define LOAD_FROM_ENG load_m256 + +#include "limex_exceptional.h" + +#include "limex_state_impl.h" + +#define INLINE_ATTR really_inline +#include "limex_common_impl.h" + +#include "limex_runtime_impl.h" diff --git a/regex/nfa/limex_simd384.c b/regex/nfa/limex_simd384.c new file mode 100644 index 000000000..7e596e48b --- /dev/null +++ b/regex/nfa/limex_simd384.c @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2015-2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief LimEx NFA: 384-bit SIMD runtime implementations. + */ + +//#define DEBUG_INPUT +//#define DEBUG_EXCEPTIONS + +#include "limex.h" + +#include "accel.h" +#include "limex_internal.h" +#include "nfa_internal.h" +#include "ue2common.h" +#include "util/bitutils.h" +#include "util/simd_utils.h" + +// Common code +#include "limex_runtime.h" + +#define SIZE 384 +#define STATE_T m384 +#define ENG_STATE_T m384 +#define LOAD_FROM_ENG load_m384 + +#include "limex_exceptional.h" + +#include "limex_state_impl.h" + +#define INLINE_ATTR really_inline +#include "limex_common_impl.h" + +#include "limex_runtime_impl.h" diff --git a/regex/nfa/limex_simd512.c b/regex/nfa/limex_simd512.c new file mode 100644 index 000000000..f779f335d --- /dev/null +++ b/regex/nfa/limex_simd512.c @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2015-2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief LimEx NFA: 512-bit SIMD runtime implementations. + */ + +//#define DEBUG_INPUT +//#define DEBUG_EXCEPTIONS + +#include "limex.h" + +#include "accel.h" +#include "limex_internal.h" +#include "nfa_internal.h" +#include "ue2common.h" +#include "util/bitutils.h" +#include "util/simd_utils.h" + +// Common code +#include "limex_runtime.h" + +#define SIZE 512 +#define STATE_T m512 +#define ENG_STATE_T m512 +#define LOAD_FROM_ENG load_m512 + +#include "limex_exceptional.h" + +#include "limex_state_impl.h" + +#define INLINE_ATTR really_inline +#include "limex_common_impl.h" + +#include "limex_runtime_impl.h" diff --git a/regex/nfa/limex_state_impl.h b/regex/nfa/limex_state_impl.h new file mode 100644 index 000000000..732874047 --- /dev/null +++ b/regex/nfa/limex_state_impl.h @@ -0,0 +1,145 @@ +/* + * Copyright (c) 2015-2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief NFA stream state handling. + */ + +#include "util/join.h" +#include "util/partial_store.h" +#include "util/state_compress.h" +#ifndef __KERNEL__ +#include +#else +#include +#endif + +#if !defined(SIZE) || !defined(STATE_T) || !defined(LOAD_FROM_ENG) +# error Must define SIZE, STATE_T, LOAD_FROM_ENG in includer. 
+#endif + +#define IMPL_NFA_T JOIN(struct LimExNFA, SIZE) +#define COMMON_T JOIN(NFACommon, SIZE) +#define REACHMASK_FN JOIN(moNfaReachMask, SIZE) +#define COMPRESS_FN JOIN(moNfaCompressState, SIZE) +#define EXPAND_FN JOIN(moNfaExpandState, SIZE) +#define COMPRESSED_STORE_FN JOIN(store_compressed_, STATE_T) +#define COMPRESSED_LOAD_FN JOIN(load_compressed_, STATE_T) +#define PARTIAL_STORE_FN JOIN(partial_store_, STATE_T) +#define PARTIAL_LOAD_FN JOIN(partial_load_, STATE_T) +#define OR_STATE JOIN(or_, STATE_T) +#define AND_STATE JOIN(and_, STATE_T) +#define ISZERO_STATE JOIN(isZero_, STATE_T) + +static really_inline +const ENG_STATE_T *get_reach_table(const IMPL_NFA_T *limex) { + const ENG_STATE_T *reach + = (const ENG_STATE_T *)((const char *)limex + sizeof(*limex)); + assert(ISALIGNED_N(reach, alignof(ENG_STATE_T))); + return reach; +} + +static really_inline +STATE_T REACHMASK_FN(const IMPL_NFA_T *limex, const u8 key) { + const ENG_STATE_T *reach = get_reach_table(limex); + return LOAD_FROM_ENG(&reach[limex->reachMap[key]]); +} + +static really_inline +void COMPRESS_FN(const IMPL_NFA_T *limex, u8 *dest, const STATE_T *src, + u8 key) { + assert(ISALIGNED_N(src, alignof(STATE_T))); + STATE_T a_src = *src; + + DEBUG_PRINTF("compress state: %p -> %p\n", src, dest); + + if (!(limex->flags & LIMEX_FLAG_COMPRESS_STATE)) { + // No key-based compression, just a partial store. + DEBUG_PRINTF("store state into %u bytes\n", limex->stateSize); + PARTIAL_STORE_FN(dest, a_src, limex->stateSize); + } else { + DEBUG_PRINTF("compress state, key=%hhx\n", key); + + STATE_T reachmask = REACHMASK_FN(limex, key); + + // Masked compression means that we mask off the initDs states and + // provide a shortcut for the all-zeroes case. Note that these must be + // switched on in the EXPAND call below. + if (limex->flags & LIMEX_FLAG_COMPRESS_MASKED) { + STATE_T s = AND_STATE(LOAD_FROM_ENG(&limex->compressMask), a_src); + if (ISZERO_STATE(s)) { + DEBUG_PRINTF("after compression mask, all states are zero\n"); + memset(dest, 0, limex->stateSize); + return; + } + + STATE_T mask = AND_STATE(LOAD_FROM_ENG(&limex->compressMask), + reachmask); + COMPRESSED_STORE_FN(dest, &s, &mask, limex->stateSize); + } else { + COMPRESSED_STORE_FN(dest, src, &reachmask, limex->stateSize); + } + } +} + +static really_inline +void EXPAND_FN(const IMPL_NFA_T *limex, STATE_T *dest, const u8 *src, u8 key) { + assert(ISALIGNED_N(dest, alignof(STATE_T))); + DEBUG_PRINTF("expand state: %p -> %p\n", src, dest); + + if (!(limex->flags & LIMEX_FLAG_COMPRESS_STATE)) { + // No key-based compression, just a partial load. 
+ DEBUG_PRINTF("load state from %u bytes\n", limex->stateSize); + *dest = PARTIAL_LOAD_FN(src, limex->stateSize); + } else { + DEBUG_PRINTF("expand state, key=%hhx\n", key); + STATE_T reachmask = REACHMASK_FN(limex, key); + + if (limex->flags & LIMEX_FLAG_COMPRESS_MASKED) { + STATE_T mask = AND_STATE(LOAD_FROM_ENG(&limex->compressMask), + reachmask); + COMPRESSED_LOAD_FN(dest, src, &mask, limex->stateSize); + *dest = OR_STATE(LOAD_FROM_ENG(&limex->initDS), *dest); + } else { + COMPRESSED_LOAD_FN(dest, src, &reachmask, limex->stateSize); + } + } +} + +#undef IMPL_NFA_T +#undef COMMON_T +#undef REACHMASK_FN +#undef COMPRESS_FN +#undef EXPAND_FN +#undef COMPRESSED_STORE_FN +#undef COMPRESSED_LOAD_FN +#undef PARTIAL_STORE_FN +#undef PARTIAL_LOAD_FN +#undef OR_STATE +#undef AND_STATE +#undef ISZERO_STATE diff --git a/regex/nfa/mcclellan.c b/regex/nfa/mcclellan.c new file mode 100644 index 000000000..71f71e327 --- /dev/null +++ b/regex/nfa/mcclellan.c @@ -0,0 +1,1350 @@ +/* + * Copyright (c) 2015-2018, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "mcclellan.h" + +#include "accel.h" +#include "mcclellan_internal.h" +#include "nfa_api.h" +#include "nfa_api_queue.h" +#include "nfa_internal.h" +#include "util/bitutils.h" +#include "util/compare.h" +#include "util/simd_utils.h" +#include "ue2common.h" + +#include "mcclellan_common_impl.h" + +static really_inline +char doComplexReport(NfaCallback cb, void *ctxt, const struct mcclellan *m, + u32 s, u64a loc, char eod, u32 *cached_accept_state, + u32 *cached_accept_id) { + DEBUG_PRINTF("reporting state = %u, loc=%llu, eod %hhu\n", + s & STATE_MASK, loc, eod); + + if (!eod && s == *cached_accept_state) { + if (cb(0, loc, *cached_accept_id, ctxt) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; /* termination requested */ + } + + return MO_CONTINUE_MATCHING; /* continue execution */ + } + + const struct mstate_aux *aux = get_aux(m, s); + size_t offset = eod ? 
aux->accept_eod : aux->accept; + + assert(offset); + const struct report_list *rl + = (const void *)((const char *)m + offset - sizeof(struct NFA)); + assert(ISALIGNED(rl)); + + DEBUG_PRINTF("report list size %u\n", rl->count); + u32 count = rl->count; + + if (!eod && count == 1) { + *cached_accept_state = s; + *cached_accept_id = rl->report[0]; + + DEBUG_PRINTF("reporting %u\n", rl->report[0]); + if (cb(0, loc, rl->report[0], ctxt) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; /* termination requested */ + } + + return MO_CONTINUE_MATCHING; /* continue execution */ + } + + for (u32 i = 0; i < count; i++) { + DEBUG_PRINTF("reporting %u\n", rl->report[i]); + if (cb(0, loc, rl->report[i], ctxt) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; /* termination requested */ + } + } + + return MO_CONTINUE_MATCHING; /* continue execution */ +} + +static really_inline +const u8 *run_mcclellan_accel(const struct mcclellan *m, + const struct mstate_aux *aux, u32 s, + const u8 **min_accel_offset, + const u8 *c, const u8 *c_end) { + DEBUG_PRINTF("skipping\n"); + u32 accel_offset = aux[s].accel_offset; + + assert(aux[s].accel_offset); + assert(accel_offset >= m->aux_offset); + assert(!m->sherman_offset || accel_offset < m->sherman_offset); + + const union AccelAux *aaux = (const void *)((const char *)m + accel_offset); + const u8 *c2 = run_accel(aaux, c, c_end); + + if (c2 < *min_accel_offset + BAD_ACCEL_DIST) { + *min_accel_offset = c2 + BIG_ACCEL_PENALTY; + } else { + *min_accel_offset = c2 + SMALL_ACCEL_PENALTY; + } + + if (*min_accel_offset >= c_end - ACCEL_MIN_LEN) { + *min_accel_offset = c_end; + } + + DEBUG_PRINTF("advanced %zd, next accel chance in %zd/%zd\n", + c2 - c, *min_accel_offset - c2, c_end - c2); + + return c2; +} + +static really_inline +u32 doNormal16(const struct mcclellan *m, const u8 **c_inout, const u8 *end, + u32 s, char do_accel, enum MatchMode mode) { + const u8 *c = *c_inout; + + const u16 *succ_table + = (const u16 *)((const char *)m + sizeof(struct mcclellan)); + assert(ISALIGNED_N(succ_table, 2)); + u32 sherman_base = m->sherman_limit; + const char *sherman_base_offset + = (const char *)m - sizeof(struct NFA) + m->sherman_offset; + u32 as = m->alphaShift; + + s &= STATE_MASK; + + while (c < end && s) { + u8 cprime = m->remap[*c]; + DEBUG_PRINTF("c: %02hhx '%c' cp:%02hhx (s=%u)\n", *c, + ourisprint(*c) ? 
*c : '?', cprime, s); + if (s < sherman_base) { + DEBUG_PRINTF("doing normal\n"); + assert(s < m->state_count); + s = succ_table[(s << as) + cprime]; + } else { + const char *sherman_state + = findShermanState(m, sherman_base_offset, sherman_base, s); + DEBUG_PRINTF("doing sherman (%u)\n", s); + s = doSherman16(sherman_state, cprime, succ_table, as); + } + + DEBUG_PRINTF("s: %u (%u)\n", s, s & STATE_MASK); + c++; + + if (do_accel && (s & ACCEL_FLAG)) { + break; + } + if (mode != NO_MATCHES && (s & ACCEPT_FLAG)) { + break; + } + + s &= STATE_MASK; + } + + *c_inout = c; + return s; +} + +static really_inline +u32 doNormalWide16(const struct mcclellan *m, const u8 **c_inout, + const u8 *end, u32 s, char *qstate, u16 *offset, + char do_accel, enum MatchMode mode) { + const u8 *c = *c_inout; + + u32 wide_limit = m->wide_limit; + const char *wide_base + = (const char *)m - sizeof(struct NFA) + m->wide_offset; + + const u16 *succ_table + = (const u16 *)((const char *)m + sizeof(struct mcclellan)); + assert(ISALIGNED_N(succ_table, 2)); + u32 sherman_base = m->sherman_limit; + const char *sherman_base_offset + = (const char *)m - sizeof(struct NFA) + m->sherman_offset; + u32 as = m->alphaShift; + + s &= STATE_MASK; + + while (c < end && s) { + u8 cprime = m->remap[*c]; + DEBUG_PRINTF("c: %02hhx '%c' cp:%02hhx (s=%u) &c: %p\n", *c, + ourisprint(*c) ? *c : '?', cprime, s, c); + + if (unlikely(s >= wide_limit)) { + const char *wide_entry + = findWideEntry16(m, wide_base, wide_limit, s); + DEBUG_PRINTF("doing wide head (%u)\n", s); + s = doWide16(wide_entry, &c, end, m->remap, (u16 *)&s, qstate, + offset); + } else if (s >= sherman_base) { + const char *sherman_state + = findShermanState(m, sherman_base_offset, sherman_base, s); + DEBUG_PRINTF("doing sherman (%u)\n", s); + s = doSherman16(sherman_state, cprime, succ_table, as); + } else { + DEBUG_PRINTF("doing normal\n"); + s = succ_table[(s << as) + cprime]; + } + + DEBUG_PRINTF("s: %u (%u)\n", s, s & STATE_MASK); + c++; + + if (do_accel && (s & ACCEL_FLAG)) { + break; + } + if (mode != NO_MATCHES && (s & ACCEPT_FLAG)) { + break; + } + + s &= STATE_MASK; + } + + *c_inout = c; + return s; +} + +static really_inline +char mcclellanExec16_i(const struct mcclellan *m, u32 *state, char *qstate, + const u8 *buf, size_t len, u64a offAdj, NfaCallback cb, + void *ctxt, char single, const u8 **c_final, + enum MatchMode mode) { + assert(ISALIGNED_N(state, 2)); + if (!len) { + if (mode == STOP_AT_MATCH) { + *c_final = buf; + } + return MO_ALIVE; + } + + u32 s = *state; + u16 offset = 0; + const u8 *c = buf; + const u8 *c_end = buf + len; + const struct mstate_aux *aux + = (const struct mstate_aux *)((const char *)m + m->aux_offset + - sizeof(struct NFA)); + + s &= STATE_MASK; + + u32 cached_accept_id = 0; + u32 cached_accept_state = 0; + + DEBUG_PRINTF("s: %u, len %zu\n", s, len); + + const u8 *min_accel_offset = c; + if (!m->has_accel || len < ACCEL_MIN_LEN) { + min_accel_offset = c_end; + goto without_accel; + } + + goto with_accel; + +without_accel: + do { + assert(c < min_accel_offset); + if (!s) { + goto exit; + } + + if (unlikely(m->has_wide)) { + s = doNormalWide16(m, &c, min_accel_offset, s, qstate, &offset, 0, + mode); + } else { + s = doNormal16(m, &c, min_accel_offset, s, 0, mode); + } + + if (mode != NO_MATCHES && (s & ACCEPT_FLAG)) { + if (mode == STOP_AT_MATCH) { + *state = s & STATE_MASK; + *c_final = c - 1; + return MO_MATCHES_PENDING; + } + + u64a loc = (c - 1) - buf + offAdj + 1; + + if (single) { + DEBUG_PRINTF("reporting %u\n", m->arb_report); 
+ if (cb(0, loc, m->arb_report, ctxt) == MO_HALT_MATCHING) { + return MO_DEAD; /* termination requested */ + } + } else if (doComplexReport(cb, ctxt, m, s & STATE_MASK, loc, 0, + &cached_accept_state, &cached_accept_id) + == MO_HALT_MATCHING) { + return MO_DEAD; + } + } + + assert(c <= min_accel_offset); + } while (c < min_accel_offset); + + s &= STATE_MASK; + + if (c == c_end) { + goto exit; + } else { + goto with_accel; + } + +with_accel: + do { + assert(c < c_end); + if (!s) { + goto exit; + } + + if (s & ACCEL_FLAG) { + DEBUG_PRINTF("skipping\n"); + s &= STATE_MASK; + c = run_mcclellan_accel(m, aux, s, &min_accel_offset, c, c_end); + if (c == c_end) { + goto exit; + } else { + goto without_accel; + } + } + + if (unlikely(m->has_wide)) { + s = doNormalWide16(m, &c, c_end, s, qstate, &offset, 1, mode); + } else { + s = doNormal16(m, &c, c_end, s, 1, mode); + } + + if (mode != NO_MATCHES && (s & ACCEPT_FLAG)) { + if (mode == STOP_AT_MATCH) { + *state = s & STATE_MASK; + *c_final = c - 1; + return MO_MATCHES_PENDING; + } + + u64a loc = (c - 1) - buf + offAdj + 1; + + if (single) { + DEBUG_PRINTF("reporting %u\n", m->arb_report); + if (cb(0, loc, m->arb_report, ctxt) == MO_HALT_MATCHING) { + return MO_DEAD; /* termination requested */ + } + } else if (doComplexReport(cb, ctxt, m, s & STATE_MASK, loc, 0, + &cached_accept_state, &cached_accept_id) + == MO_HALT_MATCHING) { + return MO_DEAD; + } + } + + assert(c <= c_end); + } while (c < c_end); + +exit: + s &= STATE_MASK; + + if (mode == STOP_AT_MATCH) { + *c_final = c_end; + } + *state = s; + + return MO_ALIVE; +} + +static never_inline +char mcclellanExec16_i_cb(const struct mcclellan *m, u32 *state, char *qstate, + const u8 *buf, size_t len, u64a offAdj, + NfaCallback cb, void *ctxt, char single, + const u8 **final_point) { + return mcclellanExec16_i(m, state, qstate, buf, len, offAdj, cb, ctxt, + single, final_point, CALLBACK_OUTPUT); +} + +static never_inline +char mcclellanExec16_i_sam(const struct mcclellan *m, u32 *state, char *qstate, + const u8 *buf, size_t len, u64a offAdj, + NfaCallback cb, void *ctxt, char single, + const u8 **final_point) { + return mcclellanExec16_i(m, state, qstate, buf, len, offAdj, cb, ctxt, + single, final_point, STOP_AT_MATCH); +} + +static never_inline +char mcclellanExec16_i_nm(const struct mcclellan *m, u32 *state, char *qstate, + const u8 *buf, size_t len, u64a offAdj, + NfaCallback cb, void *ctxt, char single, + const u8 **final_point) { + return mcclellanExec16_i(m, state, qstate, buf, len, offAdj, cb, ctxt, + single, final_point, NO_MATCHES); +} + +static really_inline +char mcclellanExec16_i_ni(const struct mcclellan *m, u32 *state, char *qstate, + const u8 *buf, size_t len, u64a offAdj, + NfaCallback cb, void *ctxt, char single, + const u8 **final_point, enum MatchMode mode) { + if (mode == CALLBACK_OUTPUT) { + return mcclellanExec16_i_cb(m, state, qstate, buf, len, offAdj, cb, + ctxt, single, final_point); + } else if (mode == STOP_AT_MATCH) { + return mcclellanExec16_i_sam(m, state, qstate, buf, len, offAdj, cb, + ctxt, single, final_point); + } else { + assert(mode == NO_MATCHES); + return mcclellanExec16_i_nm(m, state, qstate, buf, len, offAdj, cb, + ctxt, single, final_point); + } +} + +static really_inline +u32 doNormal8(const struct mcclellan *m, const u8 **c_inout, const u8 *end, + u32 s, char do_accel, enum MatchMode mode) { + const u8 *c = *c_inout; + u32 accel_limit = m->accel_limit_8; + u32 accept_limit = m->accept_limit_8; + + const u32 as = m->alphaShift; + const u8 *succ_table = 
(const u8 *)((const char *)m + + sizeof(struct mcclellan)); + while (c < end && s) { + u8 cprime = m->remap[*c]; + DEBUG_PRINTF("c: %02hhx '%c' cp:%02hhx\n", *c, + ourisprint(*c) ? *c : '?', cprime); + s = succ_table[(s << as) + cprime]; + + DEBUG_PRINTF("s: %u\n", s); + c++; + if (do_accel) { + if (s >= accel_limit) { + break; + } + } else { + if (mode != NO_MATCHES && s >= accept_limit) { + break; + } + } + } + *c_inout = c; + return s; +} + +static really_inline +char mcclellanExec8_i(const struct mcclellan *m, u32 *state, const u8 *buf, + size_t len, u64a offAdj, NfaCallback cb, void *ctxt, + char single, const u8 **c_final, enum MatchMode mode) { + if (!len) { + if (mode == STOP_AT_MATCH) { + *c_final = buf; + } + return MO_ALIVE; + } + u32 s = *state; + const u8 *c = buf; + const u8 *c_end = buf + len; + + const struct mstate_aux *aux + = (const struct mstate_aux *)((const char *)m + m->aux_offset + - sizeof(struct NFA)); + u32 accept_limit = m->accept_limit_8; + + u32 cached_accept_id = 0; + u32 cached_accept_state = 0; + + DEBUG_PRINTF("accel %hu, accept %u\n", m->accel_limit_8, accept_limit); + + DEBUG_PRINTF("s: %u, len %zu\n", s, len); + + const u8 *min_accel_offset = c; + if (!m->has_accel || len < ACCEL_MIN_LEN) { + min_accel_offset = c_end; + goto without_accel; + } + + goto with_accel; + +without_accel: + do { + assert(c < min_accel_offset); + if (!s) { + goto exit; + } + + s = doNormal8(m, &c, min_accel_offset, s, 0, mode); + + if (mode != NO_MATCHES && s >= accept_limit) { + if (mode == STOP_AT_MATCH) { + DEBUG_PRINTF("match - pausing\n"); + *state = s; + *c_final = c - 1; + return MO_MATCHES_PENDING; + } + + u64a loc = (c - 1) - buf + offAdj + 1; + if (single) { + DEBUG_PRINTF("reporting %u\n", m->arb_report); + if (cb(0, loc, m->arb_report, ctxt) == MO_HALT_MATCHING) { + return MO_DEAD; + } + } else if (doComplexReport(cb, ctxt, m, s, loc, 0, + &cached_accept_state, &cached_accept_id) + == MO_HALT_MATCHING) { + return MO_DEAD; + } + } + + assert(c <= min_accel_offset); + } while (c < min_accel_offset); + + if (c == c_end) { + goto exit; + } + +with_accel: + do { + u32 accel_limit = m->accel_limit_8; + assert(c < c_end); + + if (!s) { + goto exit; + } + + if (s >= accel_limit && aux[s].accel_offset) { + c = run_mcclellan_accel(m, aux, s, &min_accel_offset, c, c_end); + if (c == c_end) { + goto exit; + } else { + goto without_accel; + } + } + s = doNormal8(m, &c, c_end, s, 1, mode); + + if (mode != NO_MATCHES && s >= accept_limit) { + if (mode == STOP_AT_MATCH) { + DEBUG_PRINTF("match - pausing\n"); + *state = s; + *c_final = c - 1; + return MO_MATCHES_PENDING; + } + + u64a loc = (c - 1) - buf + offAdj + 1; + if (single) { + DEBUG_PRINTF("reporting %u\n", m->arb_report); + if (cb(0, loc, m->arb_report, ctxt) == MO_HALT_MATCHING) { + return MO_DEAD; + } + } else if (doComplexReport(cb, ctxt, m, s, loc, 0, + &cached_accept_state, &cached_accept_id) + == MO_HALT_MATCHING) { + return MO_DEAD; + } + } + + assert(c <= c_end); + } while (c < c_end); + +exit: + *state = s; + if (mode == STOP_AT_MATCH) { + *c_final = c_end; + } + return MO_ALIVE; +} + +static never_inline +char mcclellanExec8_i_cb(const struct mcclellan *m, u32 *state, const u8 *buf, + size_t len, u64a offAdj, NfaCallback cb, void *ctxt, + char single, const u8 **final_point) { + return mcclellanExec8_i(m, state, buf, len, offAdj, cb, ctxt, single, + final_point, CALLBACK_OUTPUT); +} + +static never_inline +char mcclellanExec8_i_sam(const struct mcclellan *m, u32 *state, const u8 *buf, + size_t len, u64a offAdj, 
NfaCallback cb, void *ctxt, + char single, const u8 **final_point) { + return mcclellanExec8_i(m, state, buf, len, offAdj, cb, ctxt, single, + final_point, STOP_AT_MATCH); +} + +static never_inline +char mcclellanExec8_i_nm(const struct mcclellan *m, u32 *state, const u8 *buf, + size_t len, u64a offAdj, NfaCallback cb, void *ctxt, + char single, const u8 **final_point) { + return mcclellanExec8_i(m, state, buf, len, offAdj, cb, ctxt, single, + final_point, NO_MATCHES); +} + +static really_inline +char mcclellanExec8_i_ni(const struct mcclellan *m, u32 *state, const u8 *buf, + size_t len, u64a offAdj, NfaCallback cb, void *ctxt, + char single, const u8 **final_point, + enum MatchMode mode) { + if (mode == CALLBACK_OUTPUT) { + return mcclellanExec8_i_cb(m, state, buf, len, offAdj, cb, ctxt, single, + final_point); + } else if (mode == STOP_AT_MATCH) { + return mcclellanExec8_i_sam(m, state, buf, len, offAdj, cb, ctxt, + single, final_point); + } else { + assert(mode == NO_MATCHES); + return mcclellanExec8_i_nm(m, state, buf, len, offAdj, cb, ctxt, single, + final_point); + } +} + +static really_inline +char mcclellanCheckEOD(const struct NFA *nfa, u32 s, u64a offset, + NfaCallback cb, void *ctxt) { + const struct mcclellan *m = getImplNfa(nfa); + const struct mstate_aux *aux = get_aux(m, s); + + if (m->has_wide == 1 && s >= m->wide_limit) { + return MO_CONTINUE_MATCHING; + } + + if (!aux->accept_eod) { + return MO_CONTINUE_MATCHING; + } + return doComplexReport(cb, ctxt, m, s, offset, 1, NULL, NULL); +} + +static really_inline +char nfaExecMcClellan16_Q2i(const struct NFA *n, u64a offset, const u8 *buffer, + const u8 *hend, NfaCallback cb, void *context, + struct mq *q, char single, s64a end, + enum MatchMode mode) { + assert(n->type == MCCLELLAN_NFA_16); + const struct mcclellan *m = getImplNfa(n); + s64a sp; + + assert(ISALIGNED_N(q->state, 2)); + u32 s = *(u16 *)q->state; + + if (q->report_current) { + assert(s); + assert(get_aux(m, s)->accept); + + int rv; + if (single) { + DEBUG_PRINTF("reporting %u\n", m->arb_report); + rv = cb(0, q_cur_offset(q), m->arb_report, context); + } else { + u32 cached_accept_id = 0; + u32 cached_accept_state = 0; + + rv = doComplexReport(cb, context, m, s, q_cur_offset(q), 0, + &cached_accept_state, &cached_accept_id); + } + + q->report_current = 0; + + if (rv == MO_HALT_MATCHING) { + return MO_DEAD; + } + } + + sp = q_cur_loc(q); + q->cur++; + + const u8 *cur_buf = sp < 0 ? 
hend : buffer; + + assert(q->cur); + if (mode != NO_MATCHES && q->items[q->cur - 1].location > end) { + DEBUG_PRINTF("this is as far as we go\n"); + q->cur--; + q->items[q->cur].type = MQE_START; + q->items[q->cur].location = end; + *(u16 *)q->state = s; + return MO_ALIVE; + } + + while (1) { + assert(q->cur < q->end); + s64a ep = q->items[q->cur].location; + if (mode != NO_MATCHES) { + ep = MIN(ep, end); + } + + assert(ep >= sp); + + s64a local_ep = ep; + if (sp < 0) { + local_ep = MIN(0, ep); + } + + /* do main buffer region */ + const u8 *final_look; + char rv = mcclellanExec16_i_ni(m, &s, q->state, cur_buf + sp, + local_ep - sp, offset + sp, cb, context, + single, &final_look, mode); + if (rv == MO_DEAD) { + *(u16 *)q->state = 0; + return MO_DEAD; + } + if (mode == STOP_AT_MATCH && rv == MO_MATCHES_PENDING) { + DEBUG_PRINTF("this is as far as we go\n"); + DEBUG_PRINTF("state %u final_look %zd\n", s, final_look - cur_buf); + + assert(q->cur); + assert(final_look != cur_buf + local_ep); + + q->cur--; + q->items[q->cur].type = MQE_START; + q->items[q->cur].location = final_look - cur_buf + 1; /* due to + * early -1 */ + *(u16 *)q->state = s; + return MO_MATCHES_PENDING; + } + + assert(rv == MO_ALIVE); + assert(q->cur); + if (mode != NO_MATCHES && q->items[q->cur].location > end) { + DEBUG_PRINTF("this is as far as we go\n"); + q->cur--; + q->items[q->cur].type = MQE_START; + q->items[q->cur].location = end; + *(u16 *)q->state = s; + return MO_ALIVE; + } + + sp = local_ep; + + if (sp == 0) { + cur_buf = buffer; + } + + if (sp != ep) { + continue; + } + + switch (q->items[q->cur].type) { + case MQE_TOP: + assert(sp + offset || !s); + if (sp + offset == 0) { + s = m->start_anchored; + break; + } + s = mcclellanEnableStarts(m, s); + break; + case MQE_END: + *(u16 *)q->state = s; + q->cur++; + return s ? MO_ALIVE : MO_DEAD; + default: + assert(!"invalid queue event"); + } + + q->cur++; + } +} + +static really_inline +char nfaExecMcClellan16_Bi(const struct NFA *n, u64a offset, const u8 *buffer, + size_t length, NfaCallback cb, void *context, + char single) { + assert(n->type == MCCLELLAN_NFA_16); + const struct mcclellan *m = getImplNfa(n); + u32 s = m->start_anchored; + + if (mcclellanExec16_i(m, &s, NULL, buffer, length, offset, cb, context, + single, NULL, CALLBACK_OUTPUT) + == MO_DEAD) { + return s ? MO_ALIVE : MO_DEAD; + } + + if (m->has_wide == 1 && s >= m->wide_limit) { + return MO_ALIVE; + } + + const struct mstate_aux *aux = get_aux(m, s); + + if (aux->accept_eod) { + doComplexReport(cb, context, m, s, offset + length, 1, NULL, NULL); + } + + return MO_ALIVE; +} + +static really_inline +char nfaExecMcClellan8_Q2i(const struct NFA *n, u64a offset, const u8 *buffer, + const u8 *hend, NfaCallback cb, void *context, + struct mq *q, char single, s64a end, + enum MatchMode mode) { + assert(n->type == MCCLELLAN_NFA_8); + const struct mcclellan *m = getImplNfa(n); + s64a sp; + + u32 s = *(u8 *)q->state; + + if (q->report_current) { + assert(s); + assert(s >= m->accept_limit_8); + + int rv; + if (single) { + DEBUG_PRINTF("reporting %u\n", m->arb_report); + rv = cb(0, q_cur_offset(q), m->arb_report, context); + } else { + u32 cached_accept_id = 0; + u32 cached_accept_state = 0; + + rv = doComplexReport(cb, context, m, s, q_cur_offset(q), 0, + &cached_accept_state, &cached_accept_id); + } + + q->report_current = 0; + + if (rv == MO_HALT_MATCHING) { + return MO_DEAD; + } + } + + sp = q_cur_loc(q); + q->cur++; + + const u8 *cur_buf = sp < 0 ? 
hend : buffer; + + if (mode != NO_MATCHES && q->items[q->cur - 1].location > end) { + DEBUG_PRINTF("this is as far as we go\n"); + q->cur--; + q->items[q->cur].type = MQE_START; + q->items[q->cur].location = end; + *(u8 *)q->state = s; + return MO_ALIVE; + } + + while (1) { + DEBUG_PRINTF("%s @ %llu\n", q->items[q->cur].type == MQE_TOP ? "TOP" : + q->items[q->cur].type == MQE_END ? "END" : "???", + q->items[q->cur].location + offset); + assert(q->cur < q->end); + s64a ep = q->items[q->cur].location; + if (mode != NO_MATCHES) { + ep = MIN(ep, end); + } + + assert(ep >= sp); + + s64a local_ep = ep; + if (sp < 0) { + local_ep = MIN(0, ep); + } + + const u8 *final_look; + char rv = mcclellanExec8_i_ni(m, &s, cur_buf + sp, local_ep - sp, + offset + sp, cb, context, single, + &final_look, mode); + + if (rv == MO_HALT_MATCHING) { + *(u8 *)q->state = 0; + return MO_DEAD; + } + if (mode == STOP_AT_MATCH && rv == MO_MATCHES_PENDING) { + DEBUG_PRINTF("this is as far as we go\n"); + DEBUG_PRINTF("state %u final_look %zd\n", s, final_look - cur_buf); + + assert(q->cur); + assert(final_look != cur_buf + local_ep); + + q->cur--; + q->items[q->cur].type = MQE_START; + q->items[q->cur].location = final_look - cur_buf + 1; /* due to + * early -1 */ + *(u8 *)q->state = s; + return MO_MATCHES_PENDING; + } + + assert(rv == MO_ALIVE); + assert(q->cur); + if (mode != NO_MATCHES && q->items[q->cur].location > end) { + DEBUG_PRINTF("this is as far as we go\n"); + assert(q->cur); + q->cur--; + q->items[q->cur].type = MQE_START; + q->items[q->cur].location = end; + *(u8 *)q->state = s; + return MO_ALIVE; + } + + sp = local_ep; + + if (sp == 0) { + cur_buf = buffer; + } + + if (sp != ep) { + continue; + } + + switch (q->items[q->cur].type) { + case MQE_TOP: + assert(sp + offset || !s); + if (sp + offset == 0) { + s = (u8)m->start_anchored; + break; + } + s = mcclellanEnableStarts(m, s); + break; + case MQE_END: + *(u8 *)q->state = s; + q->cur++; + return s ? MO_ALIVE : MO_DEAD; + default: + assert(!"invalid queue event"); + } + + q->cur++; + } +} + +static really_inline +char nfaExecMcClellan8_Bi(const struct NFA *n, u64a offset, const u8 *buffer, + size_t length, NfaCallback cb, void *context, + char single) { + assert(n->type == MCCLELLAN_NFA_8); + const struct mcclellan *m = getImplNfa(n); + u32 s = m->start_anchored; + + if (mcclellanExec8_i(m, &s, buffer, length, offset, cb, context, single, + NULL, CALLBACK_OUTPUT) + == MO_DEAD) { + return MO_DEAD; + } + + const struct mstate_aux *aux = get_aux(m, s); + + if (aux->accept_eod) { + doComplexReport(cb, context, m, s, offset + length, 1, NULL, NULL); + } + + return s ? 
MO_ALIVE : MO_DEAD; +} + +char nfaExecMcClellan8_B(const struct NFA *n, u64a offset, const u8 *buffer, + size_t length, NfaCallback cb, void *context) { + assert(n->type == MCCLELLAN_NFA_8); + const struct mcclellan *m = getImplNfa(n); + + if (m->flags & MCCLELLAN_FLAG_SINGLE) { + return nfaExecMcClellan8_Bi(n, offset, buffer, length, cb, context, 1); + } else { + return nfaExecMcClellan8_Bi(n, offset, buffer, length, cb, context, 0); + } +} + +char nfaExecMcClellan8_Q(const struct NFA *n, struct mq *q, s64a end) { + u64a offset = q->offset; + const u8 *buffer = q->buffer; + NfaCallback cb = q->cb; + void *context = q->context; + assert(n->type == MCCLELLAN_NFA_8); + const struct mcclellan *m = getImplNfa(n); + const u8 *hend = q->history + q->hlength; + + return nfaExecMcClellan8_Q2i(n, offset, buffer, hend, cb, context, q, + m->flags & MCCLELLAN_FLAG_SINGLE, end, + CALLBACK_OUTPUT); +} + +char nfaExecMcClellan16_B(const struct NFA *n, u64a offset, const u8 *buffer, + size_t length, NfaCallback cb, void *context) { + assert(n->type == MCCLELLAN_NFA_16); + const struct mcclellan *m = getImplNfa(n); + + if (m->flags & MCCLELLAN_FLAG_SINGLE) { + return nfaExecMcClellan16_Bi(n, offset, buffer, length, cb, context, 1); + } else { + return nfaExecMcClellan16_Bi(n, offset, buffer, length, cb, context, 0); + } +} + +char nfaExecMcClellan16_Q(const struct NFA *n, struct mq *q, s64a end) { + u64a offset = q->offset; + const u8 *buffer = q->buffer; + NfaCallback cb = q->cb; + void *context = q->context; + assert(n->type == MCCLELLAN_NFA_16); + const struct mcclellan *m = getImplNfa(n); + const u8 *hend = q->history + q->hlength; + + return nfaExecMcClellan16_Q2i(n, offset, buffer, hend, cb, context, q, + m->flags & MCCLELLAN_FLAG_SINGLE, end, + CALLBACK_OUTPUT); +} + +char nfaExecMcClellan8_reportCurrent(const struct NFA *n, struct mq *q) { + const struct mcclellan *m = getImplNfa(n); + NfaCallback cb = q->cb; + void *ctxt = q->context; + u32 s = *(u8 *)q->state; + u8 single = m->flags & MCCLELLAN_FLAG_SINGLE; + u64a offset = q_cur_offset(q); + assert(q_cur_type(q) == MQE_START); + assert(s); + + if (s >= m->accept_limit_8) { + if (single) { + DEBUG_PRINTF("reporting %u\n", m->arb_report); + cb(0, offset, m->arb_report, ctxt); + } else { + u32 cached_accept_id = 0; + u32 cached_accept_state = 0; + + doComplexReport(cb, ctxt, m, s, offset, 0, &cached_accept_state, + &cached_accept_id); + } + } + + return 0; +} + +char nfaExecMcClellan16_reportCurrent(const struct NFA *n, struct mq *q) { + const struct mcclellan *m = getImplNfa(n); + NfaCallback cb = q->cb; + void *ctxt = q->context; + u32 s = *(u16 *)q->state; + const struct mstate_aux *aux = get_aux(m, s); + u8 single = m->flags & MCCLELLAN_FLAG_SINGLE; + u64a offset = q_cur_offset(q); + assert(q_cur_type(q) == MQE_START); + DEBUG_PRINTF("state %u\n", s); + assert(s); + + if (aux->accept) { + if (single) { + DEBUG_PRINTF("reporting %u\n", m->arb_report); + cb(0, offset, m->arb_report, ctxt); + } else { + u32 cached_accept_id = 0; + u32 cached_accept_state = 0; + + doComplexReport(cb, ctxt, m, s, offset, 0, &cached_accept_state, + &cached_accept_id); + } + } + + return 0; +} + +static +char mcclellanHasAccept(const struct mcclellan *m, const struct mstate_aux *aux, + ReportID report) { + assert(m && aux); + + if (!aux->accept) { + return 0; + } + + const struct report_list *rl = (const struct report_list *) + ((const char *)m + aux->accept - sizeof(struct NFA)); + assert(ISALIGNED_N(rl, 4)); + + DEBUG_PRINTF("report list has %u entries\n", 
rl->count); + + for (u32 i = 0; i < rl->count; i++) { + if (rl->report[i] == report) { + return 1; + } + } + + return 0; +} + +char nfaExecMcClellan8_inAccept(const struct NFA *n, ReportID report, + struct mq *q) { + assert(n && q); + + const struct mcclellan *m = getImplNfa(n); + u8 s = *(u8 *)q->state; + DEBUG_PRINTF("checking accepts for %hhu\n", s); + if (s < m->accept_limit_8) { + return 0; + } + + return mcclellanHasAccept(m, get_aux(m, s), report); +} + +char nfaExecMcClellan8_inAnyAccept(const struct NFA *n, struct mq *q) { + assert(n && q); + + const struct mcclellan *m = getImplNfa(n); + u8 s = *(u8 *)q->state; + DEBUG_PRINTF("checking accepts for %hhu\n", s); + assert(s < m->accept_limit_8 || get_aux(m, s)->accept); + + return s >= m->accept_limit_8; +} + +char nfaExecMcClellan16_inAccept(const struct NFA *n, ReportID report, + struct mq *q) { + assert(n && q); + + const struct mcclellan *m = getImplNfa(n); + u16 s = *(u16 *)q->state; + DEBUG_PRINTF("checking accepts for %hu\n", s); + + return (m->has_wide == 1 && s >= m->wide_limit) ? + 0 : mcclellanHasAccept(m, get_aux(m, s), report); +} + +char nfaExecMcClellan16_inAnyAccept(const struct NFA *n, struct mq *q) { + assert(n && q); + + const struct mcclellan *m = getImplNfa(n); + u16 s = *(u16 *)q->state; + DEBUG_PRINTF("checking accepts for %hu\n", s); + + return (m->has_wide == 1 && s >= m->wide_limit) ? + 0 : !!get_aux(m, s)->accept; +} + +char nfaExecMcClellan8_Q2(const struct NFA *n, struct mq *q, s64a end) { + u64a offset = q->offset; + const u8 *buffer = q->buffer; + NfaCallback cb = q->cb; + void *context = q->context; + assert(n->type == MCCLELLAN_NFA_8); + const struct mcclellan *m = getImplNfa(n); + const u8 *hend = q->history + q->hlength; + + return nfaExecMcClellan8_Q2i(n, offset, buffer, hend, cb, context, q, + m->flags & MCCLELLAN_FLAG_SINGLE, end, + STOP_AT_MATCH); +} + +char nfaExecMcClellan16_Q2(const struct NFA *n, struct mq *q, s64a end) { + u64a offset = q->offset; + const u8 *buffer = q->buffer; + NfaCallback cb = q->cb; + void *context = q->context; + assert(n->type == MCCLELLAN_NFA_16); + const struct mcclellan *m = getImplNfa(n); + const u8 *hend = q->history + q->hlength; + + return nfaExecMcClellan16_Q2i(n, offset, buffer, hend, cb, context, q, + m->flags & MCCLELLAN_FLAG_SINGLE, end, + STOP_AT_MATCH); +} + +char nfaExecMcClellan8_QR(const struct NFA *n, struct mq *q, ReportID report) { + u64a offset = q->offset; + const u8 *buffer = q->buffer; + NfaCallback cb = q->cb; + void *context = q->context; + assert(n->type == MCCLELLAN_NFA_8); + const struct mcclellan *m = getImplNfa(n); + const u8 *hend = q->history + q->hlength; + + char rv = nfaExecMcClellan8_Q2i(n, offset, buffer, hend, cb, context, q, + m->flags & MCCLELLAN_FLAG_SINGLE, 0 /* end */, + NO_MATCHES); + if (rv && nfaExecMcClellan8_inAccept(n, report, q)) { + return MO_MATCHES_PENDING; + } else { + return rv; + } +} + +char nfaExecMcClellan16_QR(const struct NFA *n, struct mq *q, ReportID report) { + u64a offset = q->offset; + const u8 *buffer = q->buffer; + NfaCallback cb = q->cb; + void *context = q->context; + assert(n->type == MCCLELLAN_NFA_16); + const struct mcclellan *m = getImplNfa(n); + const u8 *hend = q->history + q->hlength; + + char rv = nfaExecMcClellan16_Q2i(n, offset, buffer, hend, cb, context, q, + m->flags & MCCLELLAN_FLAG_SINGLE, + 0 /* end */, NO_MATCHES); + + if (rv && nfaExecMcClellan16_inAccept(n, report, q)) { + return MO_MATCHES_PENDING; + } else { + return rv; + } +} + +char 
nfaExecMcClellan8_initCompressedState(const struct NFA *nfa, u64a offset, + void *state, UNUSED u8 key) { + const struct mcclellan *m = getImplNfa(nfa); + u8 s = offset ? m->start_floating : m->start_anchored; + if (s) { + *(u8 *)state = s; + return 1; + } + return 0; +} + +char nfaExecMcClellan16_initCompressedState(const struct NFA *nfa, u64a offset, + void *state, UNUSED u8 key) { + const struct mcclellan *m = getImplNfa(nfa); + u16 s = offset ? m->start_floating : m->start_anchored; + + // new byte + if (m->has_wide) { + unaligned_store_u16((u16 *)state + 1, 0); + } + + if (s) { + unaligned_store_u16(state, s); + return 1; + } + return 0; +} + +void nfaExecMcClellan8_SimpStream(const struct NFA *nfa, char *state, + const u8 *buf, char top, size_t start_off, + size_t len, NfaCallback cb, void *ctxt) { + const struct mcclellan *m = getImplNfa(nfa); + + u32 s = top ? m->start_anchored : *(u8 *)state; + + if (m->flags & MCCLELLAN_FLAG_SINGLE) { + mcclellanExec8_i(m, &s, buf + start_off, len - start_off, + start_off, cb, ctxt, 1, NULL, CALLBACK_OUTPUT); + } else { + mcclellanExec8_i(m, &s, buf + start_off, len - start_off, + start_off, cb, ctxt, 0, NULL, CALLBACK_OUTPUT); + } + + *(u8 *)state = s; +} + +void nfaExecMcClellan16_SimpStream(const struct NFA *nfa, char *state, + const u8 *buf, char top, size_t start_off, + size_t len, NfaCallback cb, void *ctxt) { + const struct mcclellan *m = getImplNfa(nfa); + u32 s; + + if (top) { + s = m->start_anchored; + + // new byte + if (m->has_wide) { + unaligned_store_u16((u16 *)state + 1, 0); + } + } else { + s = unaligned_load_u16(state); + } + + if (m->flags & MCCLELLAN_FLAG_SINGLE) { + mcclellanExec16_i(m, &s, state, buf + start_off, len - start_off, + start_off, cb, ctxt, 1, NULL, CALLBACK_OUTPUT); + } else { + mcclellanExec16_i(m, &s, state, buf + start_off, len - start_off, + start_off, cb, ctxt, 0, NULL, CALLBACK_OUTPUT); + } + + unaligned_store_u16(state, s); +} + +char nfaExecMcClellan8_testEOD(const struct NFA *nfa, const char *state, + UNUSED const char *streamState, u64a offset, + NfaCallback callback, void *context) { + return mcclellanCheckEOD(nfa, *(const u8 *)state, offset, callback, + context); +} + +char nfaExecMcClellan16_testEOD(const struct NFA *nfa, const char *state, + UNUSED const char *streamState, u64a offset, + NfaCallback callback, void *context) { + assert(ISALIGNED_N(state, 2)); + return mcclellanCheckEOD(nfa, *(const u16 *)state, offset, callback, + context); +} + +char nfaExecMcClellan8_queueInitState(UNUSED const struct NFA *nfa, + struct mq *q) { + assert(nfa->scratchStateSize == 1); + *(u8 *)q->state = 0; + return 0; +} + +char nfaExecMcClellan16_queueInitState(UNUSED const struct NFA *nfa, + struct mq *q) { + const struct mcclellan *m = getImplNfa(nfa); + assert(m->has_wide == 1 ? 
nfa->scratchStateSize == 4 + : nfa->scratchStateSize == 2); + assert(ISALIGNED_N(q->state, 2)); + *(u16 *)q->state = 0; + + // new byte + if (m->has_wide) { + unaligned_store_u16((u16 *)q->state + 1, 0); + } + return 0; +} + +char nfaExecMcClellan8_queueCompressState(UNUSED const struct NFA *nfa, + const struct mq *q, UNUSED s64a loc) { + void *dest = q->streamState; + const void *src = q->state; + assert(nfa->scratchStateSize == 1); + assert(nfa->streamStateSize == 1); + *(u8 *)dest = *(const u8 *)src; + return 0; +} + +char nfaExecMcClellan8_expandState(UNUSED const struct NFA *nfa, void *dest, + const void *src, UNUSED u64a offset, + UNUSED u8 key) { + assert(nfa->scratchStateSize == 1); + assert(nfa->streamStateSize == 1); + *(u8 *)dest = *(const u8 *)src; + return 0; +} + +char nfaExecMcClellan16_queueCompressState(UNUSED const struct NFA *nfa, + const struct mq *q, + UNUSED s64a loc) { + const struct mcclellan *m = getImplNfa(nfa); + void *dest = q->streamState; + const void *src = q->state; + assert(m->has_wide == 1 ? nfa->scratchStateSize == 4 + : nfa->scratchStateSize == 2); + assert(m->has_wide == 1 ? nfa->streamStateSize == 4 + : nfa->streamStateSize == 2); + + assert(ISALIGNED_N(src, 2)); + unaligned_store_u16(dest, *(const u16 *)(src)); + + // new byte + if (m->has_wide) { + unaligned_store_u16((u16 *)dest + 1, *((const u16 *)src + 1)); + } + return 0; +} + +char nfaExecMcClellan16_expandState(UNUSED const struct NFA *nfa, void *dest, + const void *src, UNUSED u64a offset, + UNUSED u8 key) { + const struct mcclellan *m = getImplNfa(nfa); + assert(m->has_wide == 1 ? nfa->scratchStateSize == 4 + : nfa->scratchStateSize == 2); + assert(m->has_wide == 1 ? nfa->streamStateSize == 4 + : nfa->streamStateSize == 2); + + assert(ISALIGNED_N(dest, 2)); + *(u16 *)dest = unaligned_load_u16(src); + + // new byte + if (m->has_wide) { + *((u16 *)dest + 1) = unaligned_load_u16((const u16 *)src + 1); + } + return 0; +} diff --git a/regex/nfa/mcclellan.h b/regex/nfa/mcclellan.h new file mode 100644 index 000000000..9c6b3eecb --- /dev/null +++ b/regex/nfa/mcclellan.h @@ -0,0 +1,109 @@ +/* + * Copyright (c) 2015-2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef MCCLELLAN_H +#define MCCLELLAN_H + +#include "callback.h" +#include "ue2common.h" + +struct mq; +struct NFA; + +// 8-bit McClellan + +char nfaExecMcClellan8_testEOD(const struct NFA *nfa, const char *state, + const char *streamState, u64a offset, + NfaCallback callback, void *context); +char nfaExecMcClellan8_Q(const struct NFA *n, struct mq *q, s64a end); +char nfaExecMcClellan8_Q2(const struct NFA *n, struct mq *q, s64a end); +char nfaExecMcClellan8_QR(const struct NFA *n, struct mq *q, ReportID report); +char nfaExecMcClellan8_reportCurrent(const struct NFA *n, struct mq *q); +char nfaExecMcClellan8_inAccept(const struct NFA *n, ReportID report, + struct mq *q); +char nfaExecMcClellan8_inAnyAccept(const struct NFA *n, struct mq *q); +char nfaExecMcClellan8_queueInitState(const struct NFA *n, struct mq *q); +char nfaExecMcClellan8_initCompressedState(const struct NFA *n, u64a offset, + void *state, u8 key); +char nfaExecMcClellan8_queueCompressState(const struct NFA *nfa, + const struct mq *q, s64a loc); +char nfaExecMcClellan8_expandState(const struct NFA *nfa, void *dest, + const void *src, u64a offset, u8 key); + +#define nfaExecMcClellan8_B_Reverse NFA_API_NO_IMPL +#define nfaExecMcClellan8_zombie_status NFA_API_ZOMBIE_NO_IMPL + +// 16-bit McClellan + +char nfaExecMcClellan16_testEOD(const struct NFA *nfa, const char *state, + const char *streamState, u64a offset, + NfaCallback callback, void *context); +char nfaExecMcClellan16_Q(const struct NFA *n, struct mq *q, s64a end); +char nfaExecMcClellan16_Q2(const struct NFA *n, struct mq *q, s64a end); +char nfaExecMcClellan16_QR(const struct NFA *n, struct mq *q, ReportID report); +char nfaExecMcClellan16_reportCurrent(const struct NFA *n, struct mq *q); +char nfaExecMcClellan16_inAccept(const struct NFA *n, ReportID report, + struct mq *q); +char nfaExecMcClellan16_inAnyAccept(const struct NFA *n, struct mq *q); +char nfaExecMcClellan16_queueInitState(const struct NFA *n, struct mq *q); +char nfaExecMcClellan16_initCompressedState(const struct NFA *n, u64a offset, + void *state, u8 key); +char nfaExecMcClellan16_queueCompressState(const struct NFA *nfa, + const struct mq *q, s64a loc); +char nfaExecMcClellan16_expandState(const struct NFA *nfa, void *dest, + const void *src, u64a offset, u8 key); + +#define nfaExecMcClellan16_B_Reverse NFA_API_NO_IMPL +#define nfaExecMcClellan16_zombie_status NFA_API_ZOMBIE_NO_IMPL + +/** + * Simple streaming mode calls: + * - always uses the anchored start state regardless if top is set regardless of + * start_off + * - never checks eod + */ +void nfaExecMcClellan8_SimpStream(const struct NFA *nfa, char *state, + const u8 *buf, char top, size_t start_off, + size_t len, NfaCallback cb, void *ctxt); + +void nfaExecMcClellan16_SimpStream(const struct NFA *nfa, char *state, + const u8 *buf, char top, size_t start_off, + size_t len, NfaCallback cb, void *ctxt); + +/** + * Simple block mode calls: + * - always uses the anchored start state regardless of initial 
start + */ + +char nfaExecMcClellan8_B(const struct NFA *n, u64a offset, const u8 *buffer, + size_t length, NfaCallback cb, void *context); + +char nfaExecMcClellan16_B(const struct NFA *n, u64a offset, const u8 *buffer, + size_t length, NfaCallback cb, void *context); + +#endif diff --git a/regex/nfa/mcclellan_common_impl.h b/regex/nfa/mcclellan_common_impl.h new file mode 100644 index 000000000..7b0e7f48c --- /dev/null +++ b/regex/nfa/mcclellan_common_impl.h @@ -0,0 +1,189 @@ +/* + * Copyright (c) 2015-2018, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +enum MatchMode { + CALLBACK_OUTPUT, + STOP_AT_MATCH, + NO_MATCHES +}; + +static really_inline +const struct mstate_aux *get_aux(const struct mcclellan *m, u32 s) { + const char *nfa = (const char *)m - sizeof(struct NFA); + const struct mstate_aux *aux + = s + (const struct mstate_aux *)(nfa + m->aux_offset); + + assert(ISALIGNED(aux)); + return aux; +} + +static really_inline +u32 mcclellanEnableStarts(const struct mcclellan *m, u32 s) { + const struct mstate_aux *aux = get_aux(m, s); + + DEBUG_PRINTF("enabling starts %u->%hu\n", s, aux->top); + return aux->top; +} + +static really_inline +u32 doSherman16(const char *sherman_state, u8 cprime, const u16 *succ_table, + u32 as) { + assert(ISALIGNED_N(sherman_state, 16)); + + u8 len = *(const u8 *)(sherman_state + SHERMAN_LEN_OFFSET); + + if (len) { + m128 ss_char = load128(sherman_state); + m128 cur_char = set16x8(cprime); + + u32 z = movemask128(eq128(ss_char, cur_char)); + + /* remove header cruft: type 1, len 1, daddy 2*/ + z &= ~0xf; + z &= (1U << (len + 4)) - 1; + + if (z) { + u32 i = ctz32(z & ~0xf) - 4; + + u32 s_out = unaligned_load_u16((const u8 *)sherman_state + + SHERMAN_STATES_OFFSET(len) + + sizeof(u16) * i); + DEBUG_PRINTF("found sherman match at %u/%u for c'=%hhu s=%u\n", i, + len, cprime, s_out); + return s_out; + } + } + + u32 daddy = *(const u16 *)(sherman_state + SHERMAN_DADDY_OFFSET); + return succ_table[(daddy << as) + cprime]; +} + +static really_inline +u16 doWide16(const char *wide_entry, const u8 **c_inout, const u8 *end, + const u8 *remap, const u16 *s, char *qstate, u16 *offset) { + // Internal relative offset after the last visit of the wide state. + if (qstate != NULL) { // stream mode + *offset = unaligned_load_u16((const u16 *)(qstate + 2)); + } + + u8 successful = 0; + const u8 *c = *c_inout; + u32 len_c = end - c; + + u16 width = *(const u16 *)(wide_entry + WIDE_WIDTH_OFFSET); + assert(width >= 8); + const u8 *symbols = (const u8 *)(wide_entry + WIDE_SYMBOL_OFFSET16); + const u16 *trans = (const u16 *)(wide_entry + + WIDE_TRANSITION_OFFSET16(width)); + + assert(*offset < width); + u16 len_w = width - *offset; + const u8 *sym = symbols + *offset; + + char tmp[16]; + u16 pos = 0; + + if (*offset == 0 && remap[*c] != *sym) { + goto normal; + } + + // both in (16, +oo). + while (len_w >= 16 && len_c >= 16) { + m128 str_w = loadu128(sym); + for (size_t i = 0; i < 16; i++) { + tmp[i] = remap[*(c + i)]; + } + m128 str_c = loadu128(tmp); + + u32 z = movemask128(eq128(str_w, str_c)); + pos = ctz32(~z); + assert(pos <= 16); + + if (pos < 16) { + goto normal; + } + + sym += 16; + c += 16; + len_w -= 16; + len_c -= 16; + } + + pos = 0; + // at least one in (0, 16). + u32 loadLength_w = MIN(len_w, 16); + u32 loadLength_c = MIN(len_c, 16); + m128 str_w = loadbytes128(sym, loadLength_w); + for (size_t i = 0; i < loadLength_c; i++) { + tmp[i] = remap[*(c + i)]; + } + m128 str_c = loadbytes128(tmp, loadLength_c); + + u32 z = movemask128(eq128(str_w, str_c)); + pos = ctz32(~z); + + pos = MIN(pos, MIN(loadLength_w, loadLength_c)); + + if (loadLength_w <= loadLength_c) { + assert(pos <= loadLength_w); + // successful matching. + if (pos == loadLength_w) { + c -= 1; + successful = 1; + } + // failure, do nothing. + } else { + assert(pos <= loadLength_c); + // successful partial matching. + if (pos == loadLength_c) { + c -= 1; + goto partial; + } + // failure, do nothing. + } + +normal: + *offset = 0; + if (qstate != NULL) { + // Internal relative offset. 
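+ // Clearing it here means the next streamed visit to this wide state restarts matching from the head of the wide string.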
+ unaligned_store_u16(qstate + 2, *offset); + } + c += pos; + *c_inout = c; + return successful ? *trans : *(trans + 1 + remap[*c]); + +partial: + *offset = sym - symbols + pos; + if (qstate != NULL) { + // Internal relative offset. + unaligned_store_u16(qstate + 2, *offset); + } + c += pos; + *c_inout = c; + return *s; +} diff --git a/regex/nfa/mcclellan_internal.h b/regex/nfa/mcclellan_internal.h new file mode 100644 index 000000000..482fdb1bc --- /dev/null +++ b/regex/nfa/mcclellan_internal.h @@ -0,0 +1,164 @@ +/* + * Copyright (c) 2015-2018, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef MCCLELLAN_INTERNAL_H +#define MCCLELLAN_INTERNAL_H + +#include "nfa_internal.h" + +#ifdef __cplusplus +extern "C" +{ +#endif + +#define ACCEPT_FLAG 0x8000 +#define ACCEL_FLAG 0x4000 +#define STATE_MASK 0x3fff + +#define SHERMAN_STATE 1 + +#define SHERMAN_TYPE_OFFSET 0 +#define SHERMAN_FIXED_SIZE 32 + +#define SHERMAN_LEN_OFFSET 1 +#define SHERMAN_DADDY_OFFSET 2 +#define SHERMAN_CHARS_OFFSET 4 +#define SHERMAN_STATES_OFFSET(sso_len) (4 + (sso_len)) + +#define WIDE_STATE 2 +#define WIDE_ENTRY_OFFSET8(weo_pos) (2 + (weo_pos)) +#define WIDE_ENTRY_OFFSET16(weo_pos) (4 + (weo_pos)) + +#define WIDE_WIDTH_OFFSET 0 +#define WIDE_SYMBOL_OFFSET8 1 +#define WIDE_TRANSITION_OFFSET8(wto_width) (1 + (wto_width)) +#define WIDE_SYMBOL_OFFSET16 2 +#define WIDE_TRANSITION_OFFSET16(wto_width) (2 + ROUNDUP_N(wto_width, 2)) + +struct report_list { + u32 count; + ReportID report[]; +}; + +struct mstate_aux { + u32 accept; + u32 accept_eod; + u16 top; + u32 accel_offset; /* relative to start of struct mcclellan; 0 if no accel */ +}; + +#define MCCLELLAN_FLAG_SINGLE 1 /**< we raise only single accept id */ + +struct mcclellan { + u16 state_count; /**< total number of states */ + u32 length; /**< length of dfa in bytes */ + u16 start_anchored; /**< anchored start state */ + u16 start_floating; /**< floating start state */ + u32 aux_offset; /**< offset of the aux structures relative to the start of + * the nfa structure */ + u32 sherman_offset; /**< offset of array of sherman state offsets the + * state_info structures relative to the start of the + * nfa structure */ + u32 sherman_end; /**< offset of the end of the state_info structures + * relative to the start of the nfa structure */ + u16 accel_limit_8; /**< 8 bit, lowest accelerable state */ + u16 accept_limit_8; /**< 8 bit, lowest accept state */ + u16 sherman_limit; /**< lowest sherman state */ + u16 wide_limit; /**< 8/16 bit, lowest wide head state */ + u8 alphaShift; + u8 flags; + u8 has_accel; /**< 1 iff there are any accel plans */ + u8 has_wide; /**< 1 iff there exists any wide state */ + u8 remap[256]; /**< remaps characters to a smaller alphabet */ + ReportID arb_report; /**< one of the accepts that this dfa may raise */ + u32 accel_offset; /**< offset of accel structures from start of McClellan */ + u32 haig_offset; /**< reserved for use by Haig, relative to start of NFA */ + u32 wide_offset; /**< offset of the wide state entries to the start of the + * nfa structure */ +}; + +static really_inline +const char *findShermanState(UNUSED const struct mcclellan *m, + const char *sherman_base_offset, u32 sherman_base, + u32 s) { + const char *rv + = sherman_base_offset + SHERMAN_FIXED_SIZE * (s - sherman_base); + assert(rv < (const char *)m + m->length - sizeof(struct NFA)); + UNUSED u8 type = *(const u8 *)(rv + SHERMAN_TYPE_OFFSET); + assert(type == SHERMAN_STATE); + return rv; +} + +static really_inline +char *findMutableShermanState(char *sherman_base_offset, u16 sherman_base, + u32 s) { + return sherman_base_offset + SHERMAN_FIXED_SIZE * (s - sherman_base); +} + +static really_inline +const char *findWideEntry8(UNUSED const struct mcclellan *m, + const char *wide_base, u32 wide_limit, u32 s) { + UNUSED u8 type = *(const u8 *)wide_base; + assert(type == WIDE_STATE); + const u32 entry_offset + = *(const u32 *)(wide_base + + WIDE_ENTRY_OFFSET8((s - wide_limit) * sizeof(u32))); + + const char *rv = wide_base + entry_offset; + assert(rv < (const char *)m + m->length - sizeof(struct NFA)); + return rv; +} + +static really_inline +const char 
*findWideEntry16(UNUSED const struct mcclellan *m, + const char *wide_base, u32 wide_limit, u32 s) { + UNUSED u8 type = *(const u8 *)wide_base; + assert(type == WIDE_STATE); + const u32 entry_offset + = *(const u32 *)(wide_base + + WIDE_ENTRY_OFFSET16((s - wide_limit) * sizeof(u32))); + + const char *rv = wide_base + entry_offset; + assert(rv < (const char *)m + m->length - sizeof(struct NFA)); + return rv; +} + +static really_inline +char *findMutableWideEntry16(char *wide_base, u32 wide_limit, u32 s) { + u32 entry_offset + = *(const u32 *)(wide_base + + WIDE_ENTRY_OFFSET16((s - wide_limit) * sizeof(u32))); + + return wide_base + entry_offset; +} + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/regex/nfa/mcsheng.c b/regex/nfa/mcsheng.c new file mode 100644 index 000000000..f86acedf5 --- /dev/null +++ b/regex/nfa/mcsheng.c @@ -0,0 +1,2742 @@ +/* + * Copyright (c) 2016-2020, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "mcsheng.h" + +#include "accel.h" +#include "mcsheng_internal.h" +#include "nfa_api.h" +#include "nfa_api_queue.h" +#include "nfa_internal.h" +#include "util/arch.h" +#include "util/bitutils.h" +#include "util/compare.h" +#include "util/simd_utils.h" +#include "ue2common.h" + +enum MatchMode { + CALLBACK_OUTPUT, + STOP_AT_MATCH, + NO_MATCHES +}; + +static really_inline +const struct mstate_aux *get_aux(const struct mcsheng *m, u32 s) { + const char *nfa = (const char *)m - sizeof(struct NFA); + const struct mstate_aux *aux + = s + (const struct mstate_aux *)(nfa + m->aux_offset); + + assert(ISALIGNED(aux)); + return aux; +} + +static really_inline +u32 mcshengEnableStarts(const struct mcsheng *m, u32 s) { + const struct mstate_aux *aux = get_aux(m, s); + + DEBUG_PRINTF("enabling starts %u->%hu\n", s, aux->top); + return aux->top; +} + +static really_inline +u32 doSherman16(const char *sherman_state, u8 cprime, const u16 *succ_table, + u32 as) { + assert(ISALIGNED_N(sherman_state, 16)); + + u8 len = *(const u8 *)(sherman_state + SHERMAN_LEN_OFFSET); + + if (len) { + m128 ss_char = load128(sherman_state); + m128 cur_char = set16x8(cprime); + + u32 z = movemask128(eq128(ss_char, cur_char)); + + /* remove header cruft: type 1, len 1, daddy 2*/ + z &= ~0xf; + z &= (1U << (len + 4)) - 1; + + if (z) { + u32 i = ctz32(z & ~0xf) - 4; + + u32 s_out = unaligned_load_u16((const u8 *)sherman_state + + SHERMAN_STATES_OFFSET(len) + + sizeof(u16) * i); + DEBUG_PRINTF("found sherman match at %u/%u for c'=%hhu s=%u\n", i, + len, cprime, s_out); + return s_out; + } + } + + u32 daddy = *(const u16 *)(sherman_state + SHERMAN_DADDY_OFFSET); + return succ_table[(daddy << as) + cprime]; +} + +static really_inline +char doComplexReport(NfaCallback cb, void *ctxt, const struct mcsheng *m, + u32 s, u64a loc, char eod, u32 *cached_accept_state, + u32 *cached_accept_id) { + DEBUG_PRINTF("reporting state = %u, loc=%llu, eod %hhu\n", + s & STATE_MASK, loc, eod); + + if (!eod && s == *cached_accept_state) { + if (cb(0, loc, *cached_accept_id, ctxt) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; /* termination requested */ + } + + return MO_CONTINUE_MATCHING; /* continue execution */ + } + + const struct mstate_aux *aux = get_aux(m, s); + size_t offset = eod ? 
aux->accept_eod : aux->accept; + + assert(offset); + const struct report_list *rl + = (const void *)((const char *)m + offset - sizeof(struct NFA)); + assert(ISALIGNED(rl)); + + DEBUG_PRINTF("report list size %u\n", rl->count); + u32 count = rl->count; + + if (!eod && count == 1) { + *cached_accept_state = s; + *cached_accept_id = rl->report[0]; + + DEBUG_PRINTF("reporting %u\n", rl->report[0]); + if (cb(0, loc, rl->report[0], ctxt) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; /* termination requested */ + } + + return MO_CONTINUE_MATCHING; /* continue execution */ + } + + for (u32 i = 0; i < count; i++) { + DEBUG_PRINTF("reporting %u\n", rl->report[i]); + if (cb(0, loc, rl->report[i], ctxt) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; /* termination requested */ + } + } + + return MO_CONTINUE_MATCHING; /* continue execution */ +} + +#define SHENG_CHUNK 8 + +static really_inline +u32 doSheng(const struct mcsheng *m, const u8 **c_inout, const u8 *soft_c_end, + const u8 *hard_c_end, u32 s_in, char do_accel) { + assert(s_in < m->sheng_end); + assert(s_in); /* should not already be dead */ + assert(soft_c_end <= hard_c_end); + DEBUG_PRINTF("s_in = %u (adjusted %u)\n", s_in, s_in - 1); + m128 s = set16x8(s_in - 1); + const u8 *c = *c_inout; + const u8 *c_end = hard_c_end - SHENG_CHUNK + 1; + if (!do_accel) { + c_end = MIN(soft_c_end, hard_c_end - SHENG_CHUNK + 1); + } + const m128 *masks = m->sheng_masks; + u8 sheng_limit = m->sheng_end - 1; /* - 1: no dead state */ + u8 sheng_stop_limit = do_accel ? m->sheng_accel_limit : sheng_limit; + + /* When we use movd to get a u32 containing our state, it will have 4 lanes + * all duplicating the state. We can create versions of our limits with 4 + * copies to directly compare against, this prevents us generating code to + * extract a single copy of the state from the u32 for checking. 
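+ * E.g. a state byte of 0x2a is read back by movd as 0x2a2a2a2a, so the unsigned compare against sheng_stop_limit * 0x01010101 is equivalent to comparing the byte values directly.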
*/ + u32 sheng_stop_limit_x4 = sheng_stop_limit * 0x01010101; + +#if defined(HAVE_BMI2) && defined(ARCH_64_BIT) + u32 sheng_limit_x4 = sheng_limit * 0x01010101; + m128 simd_stop_limit = set4x32(sheng_stop_limit_x4); + m128 accel_delta = set16x8(sheng_limit - sheng_stop_limit); + DEBUG_PRINTF("end %hhu, accel %hu --> limit %hhu\n", sheng_limit, + m->sheng_accel_limit, sheng_stop_limit); +#endif + +#define SHENG_SINGLE_ITER do { \ + m128 shuffle_mask = masks[*(c++)]; \ + s = pshufb_m128(shuffle_mask, s); \ + u32 s_gpr_x4 = movd(s); /* convert to u8 */ \ + DEBUG_PRINTF("c %hhu (%c) --> s %u\n", c[-1], c[-1], s_gpr_x4); \ + if (s_gpr_x4 >= sheng_stop_limit_x4) { \ + s_gpr = s_gpr_x4; \ + goto exit; \ + } \ + } while (0) + + u8 s_gpr; + while (c < c_end) { +#if defined(HAVE_BMI2) && defined(ARCH_64_BIT) + /* This version uses pext for efficiently bitbashing out scaled + * versions of the bytes to process from a u64a */ + + u64a data_bytes = unaligned_load_u64a(c); + u64a cc0 = pdep64(data_bytes, 0xff0); /* extract scaled low byte */ + data_bytes &= ~0xffULL; /* clear low bits for scale space */ + m128 shuffle_mask0 = load128((const char *)masks + cc0); + s = pshufb_m128(shuffle_mask0, s); + m128 s_max = s; + m128 s_max0 = s_max; + DEBUG_PRINTF("c %02llx --> s %u\n", cc0 >> 4, movd(s)); + +#define SHENG_SINGLE_UNROLL_ITER(iter) \ + assert(iter); \ + u64a cc##iter = pext64(data_bytes, mcsheng_pext_mask[iter]); \ + assert(cc##iter == (u64a)c[iter] << 4); \ + m128 shuffle_mask##iter = load128((const char *)masks + cc##iter); \ + s = pshufb_m128(shuffle_mask##iter, s); \ + if (do_accel && iter == 7) { \ + /* in the final iteration we also have to check against accel */ \ + m128 s_temp = sadd_u8_m128(s, accel_delta); \ + s_max = max_u8_m128(s_max, s_temp); \ + } else { \ + s_max = max_u8_m128(s_max, s); \ + } \ + m128 s_max##iter = s_max; \ + DEBUG_PRINTF("c %02llx --> s %u max %u\n", cc##iter >> 4, \ + movd(s), movd(s_max)); + + SHENG_SINGLE_UNROLL_ITER(1); + + SHENG_SINGLE_UNROLL_ITER(2); + SHENG_SINGLE_UNROLL_ITER(3); + + SHENG_SINGLE_UNROLL_ITER(4); + SHENG_SINGLE_UNROLL_ITER(5); + + SHENG_SINGLE_UNROLL_ITER(6); + SHENG_SINGLE_UNROLL_ITER(7); + + if (movd(s_max7) >= sheng_limit_x4) { + DEBUG_PRINTF("exit found\n"); + + /* Explicitly check the last byte as it is more likely as it also + * checks for acceleration. 
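+ * If only that final byte tripped the limit, the fast path below can consume the whole chunk and skip the shift-xor blend entirely.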
*/ + if (movd(s_max6) < sheng_limit_x4) { + c += SHENG_CHUNK; + s_gpr = movq(s); + assert(s_gpr >= sheng_stop_limit); + goto exit; + } + + /* use shift-xor to create a register containing all of the max + * values */ + m128 blended = rshift64_m128(s_max0, 56); + blended = xor128(blended, rshift64_m128(s_max1, 48)); + blended = xor128(blended, rshift64_m128(s_max2, 40)); + blended = xor128(blended, rshift64_m128(s_max3, 32)); + blended = xor128(blended, rshift64_m128(s_max4, 24)); + blended = xor128(blended, rshift64_m128(s_max5, 16)); + blended = xor128(blended, rshift64_m128(s_max6, 8)); + blended = xor128(blended, s); + blended = xor128(blended, rshift64_m128(blended, 8)); + DEBUG_PRINTF("blended %016llx\n", movq(blended)); + + m128 final = min_u8_m128(blended, simd_stop_limit); + m128 cmp = sub_u8_m128(final, simd_stop_limit); + u64a stops = ~movemask128(cmp); + assert(stops); + u32 earliest = ctz32(stops); + DEBUG_PRINTF("stops %02llx, earliest %u\n", stops, earliest); + assert(earliest < 8); + c += earliest + 1; + s_gpr = movq(blended) >> (earliest * 8); + assert(s_gpr >= sheng_stop_limit); + goto exit; + } else { + c += SHENG_CHUNK; + } +#else + SHENG_SINGLE_ITER; + SHENG_SINGLE_ITER; + SHENG_SINGLE_ITER; + SHENG_SINGLE_ITER; + + SHENG_SINGLE_ITER; + SHENG_SINGLE_ITER; + SHENG_SINGLE_ITER; + SHENG_SINGLE_ITER; +#endif + } + + assert(c_end - c < SHENG_CHUNK); + if (c < soft_c_end) { + assert(soft_c_end - c < SHENG_CHUNK); + switch (soft_c_end - c) { + case 7: + SHENG_SINGLE_ITER; + FALLTHROUGH; + case 6: + SHENG_SINGLE_ITER; + FALLTHROUGH; + case 5: + SHENG_SINGLE_ITER; + FALLTHROUGH; + case 4: + SHENG_SINGLE_ITER; + FALLTHROUGH; + case 3: + SHENG_SINGLE_ITER; + FALLTHROUGH; + case 2: + SHENG_SINGLE_ITER; + FALLTHROUGH; + case 1: + SHENG_SINGLE_ITER; + } + } + + assert(c >= soft_c_end); + + s_gpr = movd(s); +exit: + assert(c <= hard_c_end); + DEBUG_PRINTF("%zu from end; s %hhu\n", c_end - c, s_gpr); + assert(c >= soft_c_end || s_gpr >= sheng_stop_limit); + /* undo state adjustment to match mcclellan view */ + if (s_gpr == sheng_limit) { + s_gpr = 0; + } else if (s_gpr < sheng_limit) { + s_gpr++; + } + + *c_inout = c; + return s_gpr; +} + +static really_inline +const char *findShermanState(UNUSED const struct mcsheng *m, + const char *sherman_base_offset, u32 sherman_base, + u32 s) { + const char *rv + = sherman_base_offset + SHERMAN_FIXED_SIZE * (s - sherman_base); + assert(rv < (const char *)m + m->length - sizeof(struct NFA)); + UNUSED u8 type = *(const u8 *)(rv + SHERMAN_TYPE_OFFSET); + assert(type == SHERMAN_STATE); + return rv; +} + +static really_inline +const u8 *run_mcsheng_accel(const struct mcsheng *m, + const struct mstate_aux *aux, u32 s, + const u8 **min_accel_offset, + const u8 *c, const u8 *c_end) { + DEBUG_PRINTF("skipping\n"); + u32 accel_offset = aux[s].accel_offset; + + assert(aux[s].accel_offset); + assert(accel_offset >= m->aux_offset); + assert(!m->sherman_offset || accel_offset < m->sherman_offset); + + const union AccelAux *aaux = (const void *)((const char *)m + accel_offset); + const u8 *c2 = run_accel(aaux, c, c_end); + + if (c2 < *min_accel_offset + BAD_ACCEL_DIST) { + *min_accel_offset = c2 + BIG_ACCEL_PENALTY; + } else { + *min_accel_offset = c2 + SMALL_ACCEL_PENALTY; + } + + if (*min_accel_offset >= c_end - ACCEL_MIN_LEN) { + *min_accel_offset = c_end; + } + + DEBUG_PRINTF("advanced %zd, next accel chance in %zd/%zd\n", + c2 - c, *min_accel_offset - c2, c_end - c2); + + return c2; +} + +static really_inline +u32 doNormal16(const struct mcsheng *m, const 
u8 **c_inout, const u8 *end, + u32 s, char do_accel, enum MatchMode mode) { + const u8 *c = *c_inout; + + const u16 *succ_table + = (const u16 *)((const char *)m + sizeof(struct mcsheng)); + assert(ISALIGNED_N(succ_table, 2)); + u32 sheng_end = m->sheng_end; + u32 sherman_base = m->sherman_limit; + const char *sherman_base_offset + = (const char *)m - sizeof(struct NFA) + m->sherman_offset; + u32 as = m->alphaShift; + + /* Adjust start of succ table so we can index into using state id (rather + * than adjust to normal id). As we will not be processing states with low + * state ids, we will not be accessing data before the succ table. Note: due + * to the size of the sheng tables, the succ_table pointer will still be + * inside the engine.*/ + succ_table -= sheng_end << as; + + s &= STATE_MASK; + + while (c < end && s >= sheng_end) { + u8 cprime = m->remap[*c]; + DEBUG_PRINTF("c: %02hhx '%c' cp:%02hhx (s=%u)\n", *c, + ourisprint(*c) ? *c : '?', cprime, s); + if (s < sherman_base) { + DEBUG_PRINTF("doing normal\n"); + assert(s < m->state_count); + s = succ_table[(s << as) + cprime]; + } else { + const char *sherman_state + = findShermanState(m, sherman_base_offset, sherman_base, s); + DEBUG_PRINTF("doing sherman (%u)\n", s); + s = doSherman16(sherman_state, cprime, succ_table, as); + } + + DEBUG_PRINTF("s: %u (%u)\n", s, s & STATE_MASK); + c++; + + if (do_accel && (s & ACCEL_FLAG)) { + break; + } + if (mode != NO_MATCHES && (s & ACCEPT_FLAG)) { + break; + } + + s &= STATE_MASK; + } + + *c_inout = c; + return s; +} + +static really_inline +char mcshengExec16_i(const struct mcsheng *m, u32 *state, const u8 *buf, + size_t len, u64a offAdj, NfaCallback cb, void *ctxt, + char single, const u8 **c_final, enum MatchMode mode) { + assert(ISALIGNED_N(state, 2)); + if (!len) { + if (mode == STOP_AT_MATCH) { + *c_final = buf; + } + return MO_ALIVE; + } + + u32 s = *state; + const u8 *c = buf; + const u8 *c_end = buf + len; + const u8 sheng_end = m->sheng_end; + const struct mstate_aux *aux + = (const struct mstate_aux *)((const char *)m + m->aux_offset + - sizeof(struct NFA)); + + s &= STATE_MASK; + + u32 cached_accept_id = 0; + u32 cached_accept_state = 0; + + DEBUG_PRINTF("s: %u, len %zu\n", s, len); + + const u8 *min_accel_offset = c; + if (!m->has_accel || len < ACCEL_MIN_LEN) { + min_accel_offset = c_end; + goto without_accel; + } + + goto with_accel; + +without_accel: + do { + assert(c < min_accel_offset); + int do_accept; + if (!s) { + goto exit; + } else if (s < sheng_end) { + s = doSheng(m, &c, min_accel_offset, c_end, s, 0); + do_accept = mode != NO_MATCHES && get_aux(m, s)->accept; + } else { + s = doNormal16(m, &c, min_accel_offset, s, 0, mode); + + do_accept = mode != NO_MATCHES && (s & ACCEPT_FLAG); + } + + if (do_accept) { + if (mode == STOP_AT_MATCH) { + *state = s & STATE_MASK; + *c_final = c - 1; + return MO_MATCHES_PENDING; + } + + u64a loc = (c - 1) - buf + offAdj + 1; + + if (single) { + DEBUG_PRINTF("reporting %u\n", m->arb_report); + if (cb(0, loc, m->arb_report, ctxt) == MO_HALT_MATCHING) { + return MO_DEAD; /* termination requested */ + } + } else if (doComplexReport(cb, ctxt, m, s & STATE_MASK, loc, 0, + &cached_accept_state, &cached_accept_id) + == MO_HALT_MATCHING) { + return MO_DEAD; + } + } + + assert(c <= c_end); /* sheng is fuzzy for min_accel_offset */ + } while (c < min_accel_offset); + + if (c == c_end) { + goto exit; + } + +with_accel: + do { + assert(c < c_end); + int do_accept; + + if (!s) { + goto exit; + } else if (s < sheng_end) { + if (s > m->sheng_accel_limit) 
{ + c = run_mcsheng_accel(m, aux, s, &min_accel_offset, c, c_end); + if (c == c_end) { + goto exit; + } else { + goto without_accel; + } + } + s = doSheng(m, &c, c_end, c_end, s, 1); + do_accept = mode != NO_MATCHES && get_aux(m, s)->accept; + } else { + if (s & ACCEL_FLAG) { + DEBUG_PRINTF("skipping\n"); + s &= STATE_MASK; + c = run_mcsheng_accel(m, aux, s, &min_accel_offset, c, c_end); + if (c == c_end) { + goto exit; + } else { + goto without_accel; + } + } + + s = doNormal16(m, &c, c_end, s, 1, mode); + do_accept = mode != NO_MATCHES && (s & ACCEPT_FLAG); + } + + if (do_accept) { + if (mode == STOP_AT_MATCH) { + *state = s & STATE_MASK; + *c_final = c - 1; + return MO_MATCHES_PENDING; + } + + u64a loc = (c - 1) - buf + offAdj + 1; + + if (single) { + DEBUG_PRINTF("reporting %u\n", m->arb_report); + if (cb(0, loc, m->arb_report, ctxt) == MO_HALT_MATCHING) { + return MO_DEAD; /* termination requested */ + } + } else if (doComplexReport(cb, ctxt, m, s & STATE_MASK, loc, 0, + &cached_accept_state, &cached_accept_id) + == MO_HALT_MATCHING) { + return MO_DEAD; + } + } + + assert(c <= c_end); + } while (c < c_end); + +exit: + s &= STATE_MASK; + + if (mode == STOP_AT_MATCH) { + *c_final = c_end; + } + *state = s; + + return MO_ALIVE; +} + +static never_inline +char mcshengExec16_i_cb(const struct mcsheng *m, u32 *state, const u8 *buf, + size_t len, u64a offAdj, NfaCallback cb, void *ctxt, + char single, const u8 **final_point) { + return mcshengExec16_i(m, state, buf, len, offAdj, cb, ctxt, single, + final_point, CALLBACK_OUTPUT); +} + +static never_inline +char mcshengExec16_i_sam(const struct mcsheng *m, u32 *state, const u8 *buf, + size_t len, u64a offAdj, NfaCallback cb, void *ctxt, + char single, const u8 **final_point) { + return mcshengExec16_i(m, state, buf, len, offAdj, cb, ctxt, single, + final_point, STOP_AT_MATCH); +} + +static never_inline +char mcshengExec16_i_nm(const struct mcsheng *m, u32 *state, const u8 *buf, + size_t len, u64a offAdj, NfaCallback cb, void *ctxt, + char single, const u8 **final_point) { + return mcshengExec16_i(m, state, buf, len, offAdj, cb, ctxt, single, + final_point, NO_MATCHES); +} + +static really_inline +char mcshengExec16_i_ni(const struct mcsheng *m, u32 *state, const u8 *buf, + size_t len, u64a offAdj, NfaCallback cb, void *ctxt, + char single, const u8 **final_point, + enum MatchMode mode) { + if (mode == CALLBACK_OUTPUT) { + return mcshengExec16_i_cb(m, state, buf, len, offAdj, cb, ctxt, + single, final_point); + } else if (mode == STOP_AT_MATCH) { + return mcshengExec16_i_sam(m, state, buf, len, offAdj, cb, ctxt, + single, final_point); + } else { + assert (mode == NO_MATCHES); + return mcshengExec16_i_nm(m, state, buf, len, offAdj, cb, ctxt, + single, final_point); + } +} + +static really_inline +u32 doNormal8(const struct mcsheng *m, const u8 **c_inout, const u8 *end, u32 s, + char do_accel, enum MatchMode mode) { + const u8 *c = *c_inout; + u32 sheng_end = m->sheng_end; + u32 accel_limit = m->accel_limit_8; + u32 accept_limit = m->accept_limit_8; + + const u32 as = m->alphaShift; + const u8 *succ_table = (const u8 *)((const char *)m + + sizeof(struct mcsheng)); + /* Adjust start of succ table so we can index into using state id (rather + * than adjust to normal id). As we will not be processing states with low + * state ids, we will not be accessing data before the succ table. 
Note: due + * to the size of the sheng tables, the succ_table pointer will still be + * inside the engine.*/ + succ_table -= sheng_end << as; + + assert(s >= sheng_end); + + while (c < end && s >= sheng_end) { + u8 cprime = m->remap[*c]; + DEBUG_PRINTF("c: %02hhx '%c' cp:%02hhx\n", *c, + ourisprint(*c) ? *c : '?', cprime); + s = succ_table[(s << as) + cprime]; + + DEBUG_PRINTF("s: %u\n", s); + c++; + if (do_accel) { + if (s >= accel_limit) { + break; + } + } else { + if (mode != NO_MATCHES && s >= accept_limit) { + break; + } + } + } + *c_inout = c; + return s; +} + +static really_inline +char mcshengExec8_i(const struct mcsheng *m, u32 *state, const u8 *buf, + size_t len, u64a offAdj, NfaCallback cb, void *ctxt, + char single, const u8 **c_final, enum MatchMode mode) { + if (!len) { + *c_final = buf; + return MO_ALIVE; + } + u32 s = *state; + const u8 *c = buf; + const u8 *c_end = buf + len; + const u8 sheng_end = m->sheng_end; + + const struct mstate_aux *aux + = (const struct mstate_aux *)((const char *)m + m->aux_offset + - sizeof(struct NFA)); + u32 accept_limit = m->accept_limit_8; + + u32 cached_accept_id = 0; + u32 cached_accept_state = 0; + + DEBUG_PRINTF("accel %hu, accept %u\n", m->accel_limit_8, accept_limit); + + DEBUG_PRINTF("s: %u, len %zu\n", s, len); + + const u8 *min_accel_offset = c; + if (!m->has_accel || len < ACCEL_MIN_LEN) { + min_accel_offset = c_end; + goto without_accel; + } + + goto with_accel; + +without_accel: + do { + assert(c < min_accel_offset); + if (!s) { + goto exit; + } else if (s < sheng_end) { + s = doSheng(m, &c, min_accel_offset, c_end, s, 0); + } else { + s = doNormal8(m, &c, min_accel_offset, s, 0, mode); + assert(c <= min_accel_offset); + } + + if (mode != NO_MATCHES && s >= accept_limit) { + if (mode == STOP_AT_MATCH) { + DEBUG_PRINTF("match - pausing\n"); + *state = s; + *c_final = c - 1; + return MO_MATCHES_PENDING; + } + + u64a loc = (c - 1) - buf + offAdj + 1; + if (single) { + DEBUG_PRINTF("reporting %u\n", m->arb_report); + if (cb(0, loc, m->arb_report, ctxt) == MO_HALT_MATCHING) { + return MO_DEAD; + } + } else if (doComplexReport(cb, ctxt, m, s, loc, 0, + &cached_accept_state, &cached_accept_id) + == MO_HALT_MATCHING) { + return MO_DEAD; + } + } + + assert(c <= c_end); /* sheng is fuzzy for min_accel_offset */ + } while (c < min_accel_offset); + + if (c == c_end) { + goto exit; + } + +with_accel: + do { + u32 accel_limit = m->accel_limit_8; + + assert(c < c_end); + if (!s) { + goto exit; + } else if (s < sheng_end) { + if (s > m->sheng_accel_limit) { + c = run_mcsheng_accel(m, aux, s, &min_accel_offset, c, c_end); + if (c == c_end) { + goto exit; + } else { + goto without_accel; + } + } + s = doSheng(m, &c, c_end, c_end, s, 1); + } else { + if (s >= accel_limit && aux[s].accel_offset) { + c = run_mcsheng_accel(m, aux, s, &min_accel_offset, c, c_end); + if (c == c_end) { + goto exit; + } else { + goto without_accel; + } + } + s = doNormal8(m, &c, c_end, s, 1, mode); + } + + if (mode != NO_MATCHES && s >= accept_limit) { + if (mode == STOP_AT_MATCH) { + DEBUG_PRINTF("match - pausing\n"); + *state = s; + *c_final = c - 1; + return MO_MATCHES_PENDING; + } + + u64a loc = (c - 1) - buf + offAdj + 1; + if (single) { + DEBUG_PRINTF("reporting %u\n", m->arb_report); + if (cb(0, loc, m->arb_report, ctxt) == MO_HALT_MATCHING) { + return MO_DEAD; + } + } else if (doComplexReport(cb, ctxt, m, s, loc, 0, + &cached_accept_state, &cached_accept_id) + == MO_HALT_MATCHING) { + return MO_DEAD; + } + } + + assert(c <= c_end); + } while (c < c_end); + +exit: 
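+ /* Either the scan consumed the whole region or the DFA went dead (s == 0); store the final state for the caller and, in STOP_AT_MATCH mode, record that the entire region was scanned. */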
+ *state = s; + if (mode == STOP_AT_MATCH) { + *c_final = c_end; + } + return MO_ALIVE; +} + +static never_inline +char mcshengExec8_i_cb(const struct mcsheng *m, u32 *state, const u8 *buf, + size_t len, u64a offAdj, NfaCallback cb, void *ctxt, + char single, const u8 **final_point) { + return mcshengExec8_i(m, state, buf, len, offAdj, cb, ctxt, single, + final_point, CALLBACK_OUTPUT); +} + +static never_inline +char mcshengExec8_i_sam(const struct mcsheng *m, u32 *state, const u8 *buf, + size_t len, u64a offAdj, NfaCallback cb, void *ctxt, + char single, const u8 **final_point) { + return mcshengExec8_i(m, state, buf, len, offAdj, cb, ctxt, single, + final_point, STOP_AT_MATCH); +} + +static never_inline +char mcshengExec8_i_nm(const struct mcsheng *m, u32 *state, const u8 *buf, + size_t len, u64a offAdj, NfaCallback cb, void *ctxt, + char single, const u8 **final_point) { + return mcshengExec8_i(m, state, buf, len, offAdj, cb, ctxt, single, + final_point, NO_MATCHES); +} + +static really_inline +char mcshengExec8_i_ni(const struct mcsheng *m, u32 *state, const u8 *buf, + size_t len, u64a offAdj, NfaCallback cb, void *ctxt, + char single, const u8 **final_point, + enum MatchMode mode) { + if (mode == CALLBACK_OUTPUT) { + return mcshengExec8_i_cb(m, state, buf, len, offAdj, cb, ctxt, single, + final_point); + } else if (mode == STOP_AT_MATCH) { + return mcshengExec8_i_sam(m, state, buf, len, offAdj, cb, ctxt, + single, final_point); + } else { + assert(mode == NO_MATCHES); + return mcshengExec8_i_nm(m, state, buf, len, offAdj, cb, ctxt, single, + final_point); + } +} + +static really_inline +char mcshengCheckEOD(const struct NFA *nfa, u32 s, u64a offset, + NfaCallback cb, void *ctxt) { + const struct mcsheng *m = getImplNfa(nfa); + const struct mstate_aux *aux = get_aux(m, s); + + if (!aux->accept_eod) { + return MO_CONTINUE_MATCHING; + } + return doComplexReport(cb, ctxt, m, s, offset, 1, NULL, NULL); +} + +static really_inline +char nfaExecMcSheng16_Q2i(const struct NFA *n, u64a offset, const u8 *buffer, + const u8 *hend, NfaCallback cb, void *context, + struct mq *q, char single, s64a end, + enum MatchMode mode) { + assert(n->type == MCSHENG_NFA_16); + const struct mcsheng *m = getImplNfa(n); + s64a sp; + + assert(ISALIGNED_N(q->state, 2)); + u32 s = *(u16 *)q->state; + + if (q->report_current) { + assert(s); + assert(get_aux(m, s)->accept); + + int rv; + if (single) { + DEBUG_PRINTF("reporting %u\n", m->arb_report); + rv = cb(0, q_cur_offset(q), m->arb_report, context); + } else { + u32 cached_accept_id = 0; + u32 cached_accept_state = 0; + + rv = doComplexReport(cb, context, m, s, q_cur_offset(q), 0, + &cached_accept_state, &cached_accept_id); + } + + q->report_current = 0; + + if (rv == MO_HALT_MATCHING) { + return MO_DEAD; + } + } + + sp = q_cur_loc(q); + q->cur++; + + const u8 *cur_buf = sp < 0 ? 
hend : buffer; + + assert(q->cur); + if (mode != NO_MATCHES && q->items[q->cur - 1].location > end) { + DEBUG_PRINTF("this is as far as we go\n"); + q->cur--; + q->items[q->cur].type = MQE_START; + q->items[q->cur].location = end; + *(u16 *)q->state = s; + return MO_ALIVE; + } + + while (1) { + assert(q->cur < q->end); + s64a ep = q->items[q->cur].location; + if (mode != NO_MATCHES) { + ep = MIN(ep, end); + } + + assert(ep >= sp); + + s64a local_ep = ep; + if (sp < 0) { + local_ep = MIN(0, ep); + } + + /* do main buffer region */ + const u8 *final_look; + char rv = mcshengExec16_i_ni(m, &s, cur_buf + sp, local_ep - sp, + offset + sp, cb, context, single, + &final_look, mode); + if (rv == MO_DEAD) { + *(u16 *)q->state = 0; + return MO_DEAD; + } + if (mode == STOP_AT_MATCH && rv == MO_MATCHES_PENDING) { + DEBUG_PRINTF("this is as far as we go\n"); + DEBUG_PRINTF("state %u final_look %zd\n", s, final_look - cur_buf); + + assert(q->cur); + assert(final_look != cur_buf + local_ep); + + q->cur--; + q->items[q->cur].type = MQE_START; + q->items[q->cur].location = final_look - cur_buf + 1; /* due to + * early -1 */ + *(u16 *)q->state = s; + return MO_MATCHES_PENDING; + } + + assert(rv == MO_ALIVE); + assert(q->cur); + if (mode != NO_MATCHES && q->items[q->cur].location > end) { + DEBUG_PRINTF("this is as far as we go\n"); + q->cur--; + q->items[q->cur].type = MQE_START; + q->items[q->cur].location = end; + *(u16 *)q->state = s; + return MO_ALIVE; + } + + sp = local_ep; + + if (sp == 0) { + cur_buf = buffer; + } + + if (sp != ep) { + continue; + } + + switch (q->items[q->cur].type) { + case MQE_TOP: + assert(sp + offset || !s); + if (sp + offset == 0) { + s = m->start_anchored; + break; + } + s = mcshengEnableStarts(m, s); + break; + case MQE_END: + *(u16 *)q->state = s; + q->cur++; + return s ? MO_ALIVE : MO_DEAD; + default: + assert(!"invalid queue event"); + } + + q->cur++; + } +} + +static really_inline +char nfaExecMcSheng8_Q2i(const struct NFA *n, u64a offset, const u8 *buffer, + const u8 *hend, NfaCallback cb, void *context, + struct mq *q, char single, s64a end, + enum MatchMode mode) { + assert(n->type == MCSHENG_NFA_8); + const struct mcsheng *m = getImplNfa(n); + s64a sp; + + u32 s = *(u8 *)q->state; + + if (q->report_current) { + assert(s); + assert(s >= m->accept_limit_8); + + int rv; + if (single) { + DEBUG_PRINTF("reporting %u\n", m->arb_report); + rv = cb(0, q_cur_offset(q), m->arb_report, context); + } else { + u32 cached_accept_id = 0; + u32 cached_accept_state = 0; + + rv = doComplexReport(cb, context, m, s, q_cur_offset(q), 0, + &cached_accept_state, &cached_accept_id); + } + + q->report_current = 0; + + if (rv == MO_HALT_MATCHING) { + return MO_DEAD; + } + } + + sp = q_cur_loc(q); + q->cur++; + + const u8 *cur_buf = sp < 0 ? hend : buffer; + + if (mode != NO_MATCHES && q->items[q->cur - 1].location > end) { + DEBUG_PRINTF("this is as far as we go\n"); + q->cur--; + q->items[q->cur].type = MQE_START; + q->items[q->cur].location = end; + *(u8 *)q->state = s; + return MO_ALIVE; + } + + while (1) { + DEBUG_PRINTF("%s @ %llu\n", q->items[q->cur].type == MQE_TOP ? "TOP" : + q->items[q->cur].type == MQE_END ? 
"END" : "???", + q->items[q->cur].location + offset); + assert(q->cur < q->end); + s64a ep = q->items[q->cur].location; + if (mode != NO_MATCHES) { + ep = MIN(ep, end); + } + + assert(ep >= sp); + + s64a local_ep = ep; + if (sp < 0) { + local_ep = MIN(0, ep); + } + + const u8 *final_look; + char rv = mcshengExec8_i_ni(m, &s, cur_buf + sp, local_ep - sp, + offset + sp, cb, context, single, + &final_look, mode); + if (rv == MO_HALT_MATCHING) { + *(u8 *)q->state = 0; + return MO_DEAD; + } + if (mode == STOP_AT_MATCH && rv == MO_MATCHES_PENDING) { + DEBUG_PRINTF("this is as far as we go\n"); + DEBUG_PRINTF("state %u final_look %zd\n", s, final_look - cur_buf); + + assert(q->cur); + assert(final_look != cur_buf + local_ep); + + q->cur--; + q->items[q->cur].type = MQE_START; + q->items[q->cur].location = final_look - cur_buf + 1; /* due to + * early -1 */ + *(u8 *)q->state = s; + return MO_MATCHES_PENDING; + } + + assert(rv == MO_ALIVE); + assert(q->cur); + if (mode != NO_MATCHES && q->items[q->cur].location > end) { + DEBUG_PRINTF("this is as far as we go\n"); + assert(q->cur); + q->cur--; + q->items[q->cur].type = MQE_START; + q->items[q->cur].location = end; + *(u8 *)q->state = s; + return MO_ALIVE; + } + + sp = local_ep; + + if (sp == 0) { + cur_buf = buffer; + } + + if (sp != ep) { + continue; + } + + switch (q->items[q->cur].type) { + case MQE_TOP: + assert(sp + offset || !s); + if (sp + offset == 0) { + s = (u8)m->start_anchored; + break; + } + s = mcshengEnableStarts(m, s); + break; + case MQE_END: + *(u8 *)q->state = s; + q->cur++; + return s ? MO_ALIVE : MO_DEAD; + default: + assert(!"invalid queue event"); + } + + q->cur++; + } +} + +char nfaExecMcSheng8_Q(const struct NFA *n, struct mq *q, s64a end) { + u64a offset = q->offset; + const u8 *buffer = q->buffer; + NfaCallback cb = q->cb; + void *context = q->context; + assert(n->type == MCSHENG_NFA_8); + const struct mcsheng *m = getImplNfa(n); + const u8 *hend = q->history + q->hlength; + + return nfaExecMcSheng8_Q2i(n, offset, buffer, hend, cb, context, q, + m->flags & MCSHENG_FLAG_SINGLE, end, + CALLBACK_OUTPUT); +} + +char nfaExecMcSheng16_Q(const struct NFA *n, struct mq *q, s64a end) { + u64a offset = q->offset; + const u8 *buffer = q->buffer; + NfaCallback cb = q->cb; + void *context = q->context; + assert(n->type == MCSHENG_NFA_16); + const struct mcsheng *m = getImplNfa(n); + const u8 *hend = q->history + q->hlength; + + return nfaExecMcSheng16_Q2i(n, offset, buffer, hend, cb, context, q, + m->flags & MCSHENG_FLAG_SINGLE, end, + CALLBACK_OUTPUT); +} + +char nfaExecMcSheng8_reportCurrent(const struct NFA *n, struct mq *q) { + const struct mcsheng *m = getImplNfa(n); + NfaCallback cb = q->cb; + void *ctxt = q->context; + u32 s = *(u8 *)q->state; + u8 single = m->flags & MCSHENG_FLAG_SINGLE; + u64a offset = q_cur_offset(q); + assert(q_cur_type(q) == MQE_START); + assert(s); + + if (s >= m->accept_limit_8) { + if (single) { + DEBUG_PRINTF("reporting %u\n", m->arb_report); + cb(0, offset, m->arb_report, ctxt); + } else { + u32 cached_accept_id = 0; + u32 cached_accept_state = 0; + + doComplexReport(cb, ctxt, m, s, offset, 0, &cached_accept_state, + &cached_accept_id); + } + } + + return 0; +} + +char nfaExecMcSheng16_reportCurrent(const struct NFA *n, struct mq *q) { + const struct mcsheng *m = getImplNfa(n); + NfaCallback cb = q->cb; + void *ctxt = q->context; + u32 s = *(u16 *)q->state; + const struct mstate_aux *aux = get_aux(m, s); + u8 single = m->flags & MCSHENG_FLAG_SINGLE; + u64a offset = q_cur_offset(q); + 
assert(q_cur_type(q) == MQE_START); + DEBUG_PRINTF("state %u\n", s); + assert(s); + + if (aux->accept) { + if (single) { + DEBUG_PRINTF("reporting %u\n", m->arb_report); + cb(0, offset, m->arb_report, ctxt); + } else { + u32 cached_accept_id = 0; + u32 cached_accept_state = 0; + + doComplexReport(cb, ctxt, m, s, offset, 0, &cached_accept_state, + &cached_accept_id); + } + } + + return 0; +} + +static +char mcshengHasAccept(const struct mcsheng *m, const struct mstate_aux *aux, + ReportID report) { + assert(m && aux); + + if (!aux->accept) { + return 0; + } + + const struct report_list *rl = (const struct report_list *) + ((const char *)m + aux->accept - sizeof(struct NFA)); + assert(ISALIGNED_N(rl, 4)); + + DEBUG_PRINTF("report list has %u entries\n", rl->count); + + for (u32 i = 0; i < rl->count; i++) { + if (rl->report[i] == report) { + return 1; + } + } + + return 0; +} + +char nfaExecMcSheng8_inAccept(const struct NFA *n, ReportID report, + struct mq *q) { + assert(n && q); + + const struct mcsheng *m = getImplNfa(n); + u8 s = *(u8 *)q->state; + DEBUG_PRINTF("checking accepts for %hhu\n", s); + + return mcshengHasAccept(m, get_aux(m, s), report); +} + +char nfaExecMcSheng8_inAnyAccept(const struct NFA *n, struct mq *q) { + assert(n && q); + + const struct mcsheng *m = getImplNfa(n); + u8 s = *(u8 *)q->state; + DEBUG_PRINTF("checking accepts for %hhu\n", s); + + return !!get_aux(m, s)->accept; +} + +char nfaExecMcSheng16_inAccept(const struct NFA *n, ReportID report, + struct mq *q) { + assert(n && q); + + const struct mcsheng *m = getImplNfa(n); + u16 s = *(u16 *)q->state; + DEBUG_PRINTF("checking accepts for %hu\n", s); + + return mcshengHasAccept(m, get_aux(m, s), report); +} + +char nfaExecMcSheng16_inAnyAccept(const struct NFA *n, struct mq *q) { + assert(n && q); + + const struct mcsheng *m = getImplNfa(n); + u16 s = *(u16 *)q->state; + DEBUG_PRINTF("checking accepts for %hu\n", s); + + return !!get_aux(m, s)->accept; +} + +char nfaExecMcSheng8_Q2(const struct NFA *n, struct mq *q, s64a end) { + u64a offset = q->offset; + const u8 *buffer = q->buffer; + NfaCallback cb = q->cb; + void *context = q->context; + assert(n->type == MCSHENG_NFA_8); + const struct mcsheng *m = getImplNfa(n); + const u8 *hend = q->history + q->hlength; + + return nfaExecMcSheng8_Q2i(n, offset, buffer, hend, cb, context, q, + m->flags & MCSHENG_FLAG_SINGLE, end, + STOP_AT_MATCH); +} + +char nfaExecMcSheng16_Q2(const struct NFA *n, struct mq *q, s64a end) { + u64a offset = q->offset; + const u8 *buffer = q->buffer; + NfaCallback cb = q->cb; + void *context = q->context; + assert(n->type == MCSHENG_NFA_16); + const struct mcsheng *m = getImplNfa(n); + const u8 *hend = q->history + q->hlength; + + return nfaExecMcSheng16_Q2i(n, offset, buffer, hend, cb, context, q, + m->flags & MCSHENG_FLAG_SINGLE, end, + STOP_AT_MATCH); +} + +char nfaExecMcSheng8_QR(const struct NFA *n, struct mq *q, ReportID report) { + u64a offset = q->offset; + const u8 *buffer = q->buffer; + NfaCallback cb = q->cb; + void *context = q->context; + assert(n->type == MCSHENG_NFA_8); + const struct mcsheng *m = getImplNfa(n); + const u8 *hend = q->history + q->hlength; + + char rv = nfaExecMcSheng8_Q2i(n, offset, buffer, hend, cb, context, q, + m->flags & MCSHENG_FLAG_SINGLE, 0 /* end */, + NO_MATCHES); + if (rv && nfaExecMcSheng8_inAccept(n, report, q)) { + return MO_MATCHES_PENDING; + } else { + return rv; + } +} + +char nfaExecMcSheng16_QR(const struct NFA *n, struct mq *q, ReportID report) { + u64a offset = q->offset; + const u8 *buffer = 
q->buffer; + NfaCallback cb = q->cb; + void *context = q->context; + assert(n->type == MCSHENG_NFA_16); + const struct mcsheng *m = getImplNfa(n); + const u8 *hend = q->history + q->hlength; + + char rv = nfaExecMcSheng16_Q2i(n, offset, buffer, hend, cb, context, q, + m->flags & MCSHENG_FLAG_SINGLE, 0 /* end */, + NO_MATCHES); + + if (rv && nfaExecMcSheng16_inAccept(n, report, q)) { + return MO_MATCHES_PENDING; + } else { + return rv; + } +} + +char nfaExecMcSheng8_initCompressedState(const struct NFA *nfa, u64a offset, + void *state, UNUSED u8 key) { + const struct mcsheng *m = getImplNfa(nfa); + u8 s = offset ? m->start_floating : m->start_anchored; + if (s) { + *(u8 *)state = s; + return 1; + } + return 0; +} + +char nfaExecMcSheng16_initCompressedState(const struct NFA *nfa, u64a offset, + void *state, UNUSED u8 key) { + const struct mcsheng *m = getImplNfa(nfa); + u16 s = offset ? m->start_floating : m->start_anchored; + if (s) { + unaligned_store_u16(state, s); + return 1; + } + return 0; +} + +char nfaExecMcSheng8_testEOD(const struct NFA *nfa, const char *state, + UNUSED const char *streamState, u64a offset, + NfaCallback callback, void *context) { + return mcshengCheckEOD(nfa, *(const u8 *)state, offset, callback, + context); +} + +char nfaExecMcSheng16_testEOD(const struct NFA *nfa, const char *state, + UNUSED const char *streamState, u64a offset, + NfaCallback callback, void *context) { + assert(ISALIGNED_N(state, 2)); + return mcshengCheckEOD(nfa, *(const u16 *)state, offset, callback, + context); +} + +char nfaExecMcSheng8_queueInitState(UNUSED const struct NFA *nfa, struct mq *q) { + assert(nfa->scratchStateSize == 1); + *(u8 *)q->state = 0; + return 0; +} + +char nfaExecMcSheng16_queueInitState(UNUSED const struct NFA *nfa, struct mq *q) { + assert(nfa->scratchStateSize == 2); + assert(ISALIGNED_N(q->state, 2)); + *(u16 *)q->state = 0; + return 0; +} + +char nfaExecMcSheng8_queueCompressState(UNUSED const struct NFA *nfa, + const struct mq *q, UNUSED s64a loc) { + void *dest = q->streamState; + const void *src = q->state; + assert(nfa->scratchStateSize == 1); + assert(nfa->streamStateSize == 1); + *(u8 *)dest = *(const u8 *)src; + return 0; +} + +char nfaExecMcSheng8_expandState(UNUSED const struct NFA *nfa, void *dest, + const void *src, UNUSED u64a offset, + UNUSED u8 key) { + assert(nfa->scratchStateSize == 1); + assert(nfa->streamStateSize == 1); + *(u8 *)dest = *(const u8 *)src; + return 0; +} + +char nfaExecMcSheng16_queueCompressState(UNUSED const struct NFA *nfa, + const struct mq *q, + UNUSED s64a loc) { + void *dest = q->streamState; + const void *src = q->state; + assert(nfa->scratchStateSize == 2); + assert(nfa->streamStateSize == 2); + assert(ISALIGNED_N(src, 2)); + unaligned_store_u16(dest, *(const u16 *)(src)); + return 0; +} + +char nfaExecMcSheng16_expandState(UNUSED const struct NFA *nfa, void *dest, + const void *src, UNUSED u64a offset, + UNUSED u8 key) { + assert(nfa->scratchStateSize == 2); + assert(nfa->streamStateSize == 2); + assert(ISALIGNED_N(dest, 2)); + *(u16 *)dest = unaligned_load_u16(src); + return 0; +} + +#if defined(HAVE_AVX512VBMI) +static really_inline +const struct mstate_aux *get_aux64(const struct mcsheng64 *m, u32 s) { + const char *nfa = (const char *)m - sizeof(struct NFA); + const struct mstate_aux *aux + = s + (const struct mstate_aux *)(nfa + m->aux_offset); + + assert(ISALIGNED(aux)); + return aux; +} + +static really_inline +u32 mcshengEnableStarts64(const struct mcsheng64 *m, u32 s) { + const struct mstate_aux *aux = 
get_aux64(m, s); + + DEBUG_PRINTF("enabling starts %u->%hu\n", s, aux->top); + return aux->top; +} + +static really_inline +char doComplexReport64(NfaCallback cb, void *ctxt, const struct mcsheng64 *m, + u32 s, u64a loc, char eod, u32 *cached_accept_state, + u32 *cached_accept_id) { + DEBUG_PRINTF("reporting state = %u, loc=%llu, eod %hhu\n", + s & STATE_MASK, loc, eod); + + if (!eod && s == *cached_accept_state) { + if (cb(0, loc, *cached_accept_id, ctxt) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; /* termination requested */ + } + + return MO_CONTINUE_MATCHING; /* continue execution */ + } + + const struct mstate_aux *aux = get_aux64(m, s); + size_t offset = eod ? aux->accept_eod : aux->accept; + + assert(offset); + const struct report_list *rl + = (const void *)((const char *)m + offset - sizeof(struct NFA)); + assert(ISALIGNED(rl)); + + DEBUG_PRINTF("report list size %u\n", rl->count); + u32 count = rl->count; + + if (!eod && count == 1) { + *cached_accept_state = s; + *cached_accept_id = rl->report[0]; + + DEBUG_PRINTF("reporting %u\n", rl->report[0]); + if (cb(0, loc, rl->report[0], ctxt) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; /* termination requested */ + } + + return MO_CONTINUE_MATCHING; /* continue execution */ + } + + for (u32 i = 0; i < count; i++) { + DEBUG_PRINTF("reporting %u\n", rl->report[i]); + if (cb(0, loc, rl->report[i], ctxt) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; /* termination requested */ + } + } + + return MO_CONTINUE_MATCHING; /* continue execution */ +} + +static really_inline +u32 doSheng64(const struct mcsheng64 *m, const u8 **c_inout, const u8 *soft_c_end, + const u8 *hard_c_end, u32 s_in, char do_accel) { + assert(s_in < m->sheng_end); + assert(s_in); /* should not already be dead */ + assert(soft_c_end <= hard_c_end); + DEBUG_PRINTF("s_in = %u (adjusted %u)\n", s_in, s_in - 1); + m512 s = set64x8(s_in - 1); + const u8 *c = *c_inout; + const u8 *c_end = hard_c_end - SHENG_CHUNK + 1; + if (!do_accel) { + c_end = MIN(soft_c_end, hard_c_end - SHENG_CHUNK + 1); + } + + const m512 *masks = m->sheng_succ_masks; + u8 sheng_limit = m->sheng_end - 1; /* - 1: no dead state */ + u8 sheng_stop_limit = do_accel ? m->sheng_accel_limit : sheng_limit; + + /* When we use movd to get a u32 containing our state, it will have 4 lanes + * all duplicating the state. We can create versions of our limits with 4 + * copies to directly compare against, this prevents us generating code to + * extract a single copy of the state from the u32 for checking. 
*/ + u32 sheng_stop_limit_x4 = sheng_stop_limit * 0x01010101; + +#if defined(HAVE_BMI2) && defined(ARCH_64_BIT) + u32 sheng_limit_x4 = sheng_limit * 0x01010101; + m512 simd_stop_limit = set16x32(sheng_stop_limit_x4); + m512 accel_delta = set64x8(sheng_limit - sheng_stop_limit); + DEBUG_PRINTF("end %hhu, accel %hu --> limit %hhu\n", sheng_limit, + m->sheng_accel_limit, sheng_stop_limit); +#endif + +#define SHENG64_SINGLE_ITER do { \ + m512 succ_mask = masks[*(c++)]; \ + s = vpermb512(s, succ_mask); \ + u32 s_gpr_x4 = movd512(s); /* convert to u8 */ \ + DEBUG_PRINTF("c %hhu (%c) --> s %u\n", c[-1], c[-1], s_gpr_x4); \ + if (s_gpr_x4 >= sheng_stop_limit_x4) { \ + s_gpr = s_gpr_x4; \ + goto exit; \ + } \ + } while (0) + + u8 s_gpr; + while (c < c_end) { +#if defined(HAVE_BMI2) && defined(ARCH_64_BIT) + /* This version uses pext for efficiently bitbashing out scaled + * versions of the bytes to process from a u64a */ + + u64a data_bytes = unaligned_load_u64a(c); + u64a cc0 = pdep64(data_bytes, 0x3fc0); /* extract scaled low byte */ + data_bytes &= ~0xffULL; /* clear low bits for scale space */ + + m512 succ_mask0 = load512((const char *)masks + cc0); + s = vpermb512(s, succ_mask0); + m512 s_max = s; + m512 s_max0 = s_max; + DEBUG_PRINTF("c %02llx --> s %u\n", cc0 >> 6, movd512(s)); + +#define SHENG64_SINGLE_UNROLL_ITER(iter) \ + assert(iter); \ + u64a cc##iter = pext64(data_bytes, mcsheng64_pext_mask[iter]); \ + assert(cc##iter == (u64a)c[iter] << 6); \ + m512 succ_mask##iter = load512((const char *)masks + cc##iter); \ + s = vpermb512(s, succ_mask##iter); \ + if (do_accel && iter == 7) { \ + /* in the final iteration we also have to check against accel */ \ + m512 s_temp = sadd_u8_m512(s, accel_delta); \ + s_max = max_u8_m512(s_max, s_temp); \ + } else { \ + s_max = max_u8_m512(s_max, s); \ + } \ + m512 s_max##iter = s_max; \ + DEBUG_PRINTF("c %02llx --> s %u max %u\n", cc##iter >> 6, \ + movd512(s), movd512(s_max)); + + SHENG64_SINGLE_UNROLL_ITER(1); + SHENG64_SINGLE_UNROLL_ITER(2); + SHENG64_SINGLE_UNROLL_ITER(3); + SHENG64_SINGLE_UNROLL_ITER(4); + SHENG64_SINGLE_UNROLL_ITER(5); + SHENG64_SINGLE_UNROLL_ITER(6); + SHENG64_SINGLE_UNROLL_ITER(7); + + if (movd512(s_max7) >= sheng_limit_x4) { + DEBUG_PRINTF("exit found\n"); + + /* Explicitly check the last byte as it is more likely as it also + * checks for acceleration. 
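+             * If s_max6 is still below the limit, only the final byte crossed
+             * it, so the state already held in s is the exit state and the
+             * blend/ctz lane search below can be skipped.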
*/ + if (movd512(s_max6) < sheng_limit_x4) { + c += SHENG_CHUNK; + s_gpr = movq512(s); + assert(s_gpr >= sheng_stop_limit); + goto exit; + } + + /* use shift-xor to create a register containing all of the max + * values */ + m512 blended = rshift64_m512(s_max0, 56); + blended = xor512(blended, rshift64_m512(s_max1, 48)); + blended = xor512(blended, rshift64_m512(s_max2, 40)); + blended = xor512(blended, rshift64_m512(s_max3, 32)); + blended = xor512(blended, rshift64_m512(s_max4, 24)); + blended = xor512(blended, rshift64_m512(s_max5, 16)); + blended = xor512(blended, rshift64_m512(s_max6, 8)); + blended = xor512(blended, s); + blended = xor512(blended, rshift64_m512(blended, 8)); + DEBUG_PRINTF("blended %016llx\n", movq512(blended)); + + m512 final = min_u8_m512(blended, simd_stop_limit); + m512 cmp = sub_u8_m512(final, simd_stop_limit); + m128 tmp = cast512to128(cmp); + u64a stops = ~movemask128(tmp); + assert(stops); + u32 earliest = ctz32(stops); + DEBUG_PRINTF("stops %02llx, earliest %u\n", stops, earliest); + assert(earliest < 8); + c += earliest + 1; + s_gpr = movq512(blended) >> (earliest * 8); + assert(s_gpr >= sheng_stop_limit); + goto exit; + } else { + c += SHENG_CHUNK; + } +#else + SHENG64_SINGLE_ITER; + SHENG64_SINGLE_ITER; + SHENG64_SINGLE_ITER; + SHENG64_SINGLE_ITER; + + SHENG64_SINGLE_ITER; + SHENG64_SINGLE_ITER; + SHENG64_SINGLE_ITER; + SHENG64_SINGLE_ITER; +#endif + } + + assert(c_end - c < SHENG_CHUNK); + if (c < soft_c_end) { + assert(soft_c_end - c < SHENG_CHUNK); + switch (soft_c_end - c) { + case 7: + SHENG64_SINGLE_ITER; // fallthrough + case 6: + SHENG64_SINGLE_ITER; // fallthrough + case 5: + SHENG64_SINGLE_ITER; // fallthrough + case 4: + SHENG64_SINGLE_ITER; // fallthrough + case 3: + SHENG64_SINGLE_ITER; // fallthrough + case 2: + SHENG64_SINGLE_ITER; // fallthrough + case 1: + SHENG64_SINGLE_ITER; // fallthrough + } + } + + assert(c >= soft_c_end); + + s_gpr = movq512(s); +exit: + assert(c <= hard_c_end); + DEBUG_PRINTF("%zu from end; s %hhu\n", c_end - c, s_gpr); + assert(c >= soft_c_end || s_gpr >= sheng_stop_limit); + /* undo state adjustment to match mcclellan view */ + if (s_gpr == sheng_limit) { + s_gpr = 0; + } else if (s_gpr < sheng_limit) { + s_gpr++; + } + + *c_inout = c; + return s_gpr; +} + +static really_inline +const char *findShermanState64(UNUSED const struct mcsheng64 *m, + const char *sherman_base_offset, + u32 sherman_base, u32 s) { + const char *rv + = sherman_base_offset + SHERMAN_FIXED_SIZE * (s - sherman_base); + assert(rv < (const char *)m + m->length - sizeof(struct NFA)); + UNUSED u8 type = *(const u8 *)(rv + SHERMAN_TYPE_OFFSET); + assert(type == SHERMAN_STATE); + return rv; +} + +static really_inline +const u8 *run_mcsheng_accel64(const struct mcsheng64 *m, + const struct mstate_aux *aux, u32 s, + const u8 **min_accel_offset, + const u8 *c, const u8 *c_end) { + DEBUG_PRINTF("skipping\n"); + u32 accel_offset = aux[s].accel_offset; + + assert(aux[s].accel_offset); + assert(accel_offset >= m->aux_offset); + assert(!m->sherman_offset || accel_offset < m->sherman_offset); + + const union AccelAux *aaux = (const void *)((const char *)m + accel_offset); + const u8 *c2 = run_accel(aaux, c, c_end); + + if (c2 < *min_accel_offset + BAD_ACCEL_DIST) { + *min_accel_offset = c2 + BIG_ACCEL_PENALTY; + } else { + *min_accel_offset = c2 + SMALL_ACCEL_PENALTY; + } + + if (*min_accel_offset >= c_end - ACCEL_MIN_LEN) { + *min_accel_offset = c_end; + } + + DEBUG_PRINTF("advanced %zd, next accel chance in %zd/%zd\n", + c2 - c, *min_accel_offset - c2, 
c_end - c2); + + return c2; +} + +static really_inline +u32 doNormal64_16(const struct mcsheng64 *m, const u8 **c_inout, const u8 *end, + u32 s, char do_accel, enum MatchMode mode) { + const u8 *c = *c_inout; + const u16 *succ_table + = (const u16 *)((const char *)m + sizeof(struct mcsheng64)); + assert(ISALIGNED_N(succ_table, 2)); + u32 sheng_end = m->sheng_end; + u32 sherman_base = m->sherman_limit; + const char *sherman_base_offset + = (const char *)m - sizeof(struct NFA) + m->sherman_offset; + u32 as = m->alphaShift; + + /* Adjust start of succ table so we can index into using state id (rather + * than adjust to normal id). As we will not be processing states with low + * state ids, we will not be accessing data before the succ table. Note: due + * to the size of the sheng tables, the succ_table pointer will still be + * inside the engine.*/ + succ_table -= sheng_end << as; + s &= STATE_MASK; + while (c < end && s >= sheng_end) { + u8 cprime = m->remap[*c]; + DEBUG_PRINTF("c: %02hhx '%c' cp:%02hhx (s=%u)\n", *c, + ourisprint(*c) ? *c : '?', cprime, s); + if (s < sherman_base) { + DEBUG_PRINTF("doing normal\n"); + assert(s < m->state_count); + s = succ_table[(s << as) + cprime]; + } else { + const char *sherman_state + = findShermanState64(m, sherman_base_offset, sherman_base, s); + DEBUG_PRINTF("doing sherman (%u)\n", s); + s = doSherman16(sherman_state, cprime, succ_table, as); + } + + DEBUG_PRINTF("s: %u (%u)\n", s, s & STATE_MASK); + c++; + + if (do_accel && (s & ACCEL_FLAG)) { + break; + } + if (mode != NO_MATCHES && (s & ACCEPT_FLAG)) { + break; + } + + s &= STATE_MASK; + } + + *c_inout = c; + return s; +} + +static really_inline +char mcsheng64Exec16_i(const struct mcsheng64 *m, u32 *state, const u8 *buf, + size_t len, u64a offAdj, NfaCallback cb, void *ctxt, + char single, const u8 **c_final, enum MatchMode mode) { + assert(ISALIGNED_N(state, 2)); + if (!len) { + if (mode == STOP_AT_MATCH) { + *c_final = buf; + } + return MO_ALIVE; + } + + u32 s = *state; + const u8 *c = buf; + const u8 *c_end = buf + len; + const u8 sheng_end = m->sheng_end; + const struct mstate_aux *aux + = (const struct mstate_aux *)((const char *)m + m->aux_offset + - sizeof(struct NFA)); + + s &= STATE_MASK; + + u32 cached_accept_id = 0; + u32 cached_accept_state = 0; + + DEBUG_PRINTF("s: %u, len %zu\n", s, len); + + const u8 *min_accel_offset = c; + if (!m->has_accel || len < ACCEL_MIN_LEN) { + min_accel_offset = c_end; + goto without_accel; + } + + goto with_accel; + +without_accel: + do { + assert(c < min_accel_offset); + int do_accept; + if (!s) { + goto exit; + } else if (s < sheng_end) { + s = doSheng64(m, &c, min_accel_offset, c_end, s, 0); + do_accept = mode != NO_MATCHES && get_aux64(m, s)->accept; + } else { + s = doNormal64_16(m, &c, min_accel_offset, s, 0, mode); + + do_accept = mode != NO_MATCHES && (s & ACCEPT_FLAG); + } + + if (do_accept) { + if (mode == STOP_AT_MATCH) { + *state = s & STATE_MASK; + *c_final = c - 1; + return MO_MATCHES_PENDING; + } + + u64a loc = (c - 1) - buf + offAdj + 1; + + if (single) { + DEBUG_PRINTF("reporting %u\n", m->arb_report); + if (cb(0, loc, m->arb_report, ctxt) == MO_HALT_MATCHING) { + return MO_DEAD; /* termination requested */ + } + } else if (doComplexReport64(cb, ctxt, m, s & STATE_MASK, loc, 0, + &cached_accept_state, + &cached_accept_id) + == MO_HALT_MATCHING) { + return MO_DEAD; + } + } + + assert(c <= c_end); /* sheng is fuzzy for min_accel_offset */ + } while (c < min_accel_offset); + + if (c == c_end) { + goto exit; + } + +with_accel: + do { + 
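+        /* Accelerated scan: on an accelerable state, run_mcsheng_accel64()
+         * skips ahead and pushes min_accel_offset forward by a penalty, so we
+         * drop back to the without_accel loop until acceleration looks
+         * profitable again. */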
assert(c < c_end); + int do_accept; + + if (!s) { + goto exit; + } else if (s < sheng_end) { + if (s > m->sheng_accel_limit) { + c = run_mcsheng_accel64(m, aux, s, &min_accel_offset, c, c_end); + if (c == c_end) { + goto exit; + } else { + goto without_accel; + } + } + s = doSheng64(m, &c, c_end, c_end, s, 1); + do_accept = mode != NO_MATCHES && get_aux64(m, s)->accept; + } else { + if (s & ACCEL_FLAG) { + DEBUG_PRINTF("skipping\n"); + s &= STATE_MASK; + c = run_mcsheng_accel64(m, aux, s, &min_accel_offset, c, c_end); + if (c == c_end) { + goto exit; + } else { + goto without_accel; + } + } + + s = doNormal64_16(m, &c, c_end, s, 1, mode); + do_accept = mode != NO_MATCHES && (s & ACCEPT_FLAG); + } + + if (do_accept) { + if (mode == STOP_AT_MATCH) { + *state = s & STATE_MASK; + *c_final = c - 1; + return MO_MATCHES_PENDING; + } + + u64a loc = (c - 1) - buf + offAdj + 1; + + if (single) { + DEBUG_PRINTF("reporting %u\n", m->arb_report); + if (cb(0, loc, m->arb_report, ctxt) == MO_HALT_MATCHING) { + return MO_DEAD; /* termination requested */ + } + } else if (doComplexReport64(cb, ctxt, m, s & STATE_MASK, loc, 0, + &cached_accept_state, + &cached_accept_id) + == MO_HALT_MATCHING) { + return MO_DEAD; + } + } + + assert(c <= c_end); + } while (c < c_end); + +exit: + s &= STATE_MASK; + + if (mode == STOP_AT_MATCH) { + *c_final = c_end; + } + *state = s; + + return MO_ALIVE; +} + +static never_inline +char mcsheng64Exec16_i_cb(const struct mcsheng64 *m, u32 *state, const u8 *buf, + size_t len, u64a offAdj, NfaCallback cb, void *ctxt, + char single, const u8 **final_point) { + return mcsheng64Exec16_i(m, state, buf, len, offAdj, cb, ctxt, single, + final_point, CALLBACK_OUTPUT); +} + +static never_inline +char mcsheng64Exec16_i_sam(const struct mcsheng64 *m, u32 *state, const u8 *buf, + size_t len, u64a offAdj, NfaCallback cb, void *ctxt, + char single, const u8 **final_point) { + return mcsheng64Exec16_i(m, state, buf, len, offAdj, cb, ctxt, single, + final_point, STOP_AT_MATCH); +} + +static never_inline +char mcsheng64Exec16_i_nm(const struct mcsheng64 *m, u32 *state, const u8 *buf, + size_t len, u64a offAdj, NfaCallback cb, void *ctxt, + char single, const u8 **final_point) { + return mcsheng64Exec16_i(m, state, buf, len, offAdj, cb, ctxt, single, + final_point, NO_MATCHES); +} + +static really_inline +char mcsheng64Exec16_i_ni(const struct mcsheng64 *m, u32 *state, const u8 *buf, + size_t len, u64a offAdj, NfaCallback cb, void *ctxt, + char single, const u8 **final_point, + enum MatchMode mode) { + if (mode == CALLBACK_OUTPUT) { + return mcsheng64Exec16_i_cb(m, state, buf, len, offAdj, cb, ctxt, + single, final_point); + } else if (mode == STOP_AT_MATCH) { + return mcsheng64Exec16_i_sam(m, state, buf, len, offAdj, cb, ctxt, + single, final_point); + } else { + assert (mode == NO_MATCHES); + return mcsheng64Exec16_i_nm(m, state, buf, len, offAdj, cb, ctxt, + single, final_point); + } +} + +static really_inline +u32 doNormal64_8(const struct mcsheng64 *m, const u8 **c_inout, const u8 *end, u32 s, + char do_accel, enum MatchMode mode) { + const u8 *c = *c_inout; + u32 sheng_end = m->sheng_end; + u32 accel_limit = m->accel_limit_8; + u32 accept_limit = m->accept_limit_8; + + const u32 as = m->alphaShift; + const u8 *succ_table = (const u8 *)((const char *)m + + sizeof(struct mcsheng64)); + /* Adjust start of succ table so we can index into using state id (rather + * than adjust to normal id). 
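+     * For example, after succ_table -= sheng_end << as, the lookup
+     * succ_table[(s << as) + cprime] for a state id s >= sheng_end reads the
+     * same u16 entry that ((s - sheng_end) << as) + cprime selects in the
+     * unadjusted table.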
As we will not be processing states with low + * state ids, we will not be accessing data before the succ table. Note: due + * to the size of the sheng tables, the succ_table pointer will still be + * inside the engine.*/ + succ_table -= sheng_end << as; + + assert(s >= sheng_end); + while (c < end && s >= sheng_end) { + u8 cprime = m->remap[*c]; + DEBUG_PRINTF("c: %02hhx '%c' cp:%02hhx\n", *c, + ourisprint(*c) ? *c : '?', cprime); + s = succ_table[(s << as) + cprime]; + + DEBUG_PRINTF("s: %u\n", s); + c++; + if (do_accel) { + if (s >= accel_limit) { + break; + } + } else { + if (mode != NO_MATCHES && s >= accept_limit) { + break; + } + } + } + *c_inout = c; + return s; +} + +static really_inline +char mcsheng64Exec8_i(const struct mcsheng64 *m, u32 *state, const u8 *buf, + size_t len, u64a offAdj, NfaCallback cb, void *ctxt, + char single, const u8 **c_final, enum MatchMode mode) { + if (!len) { + *c_final = buf; + return MO_ALIVE; + } + u32 s = *state; + const u8 *c = buf; + const u8 *c_end = buf + len; + const u8 sheng_end = m->sheng_end; + + const struct mstate_aux *aux + = (const struct mstate_aux *)((const char *)m + m->aux_offset + - sizeof(struct NFA)); + u32 accept_limit = m->accept_limit_8; + + u32 cached_accept_id = 0; + u32 cached_accept_state = 0; + + DEBUG_PRINTF("accel %hu, accept %u\n", m->accel_limit_8, accept_limit); + + DEBUG_PRINTF("s: %u, len %zu\n", s, len); + + const u8 *min_accel_offset = c; + if (!m->has_accel || len < ACCEL_MIN_LEN) { + min_accel_offset = c_end; + goto without_accel; + } + + goto with_accel; + +without_accel: + do { + assert(c < min_accel_offset); + if (!s) { + goto exit; + } else if (s < sheng_end) { + s = doSheng64(m, &c, min_accel_offset, c_end, s, 0); + } else { + s = doNormal64_8(m, &c, min_accel_offset, s, 0, mode); + assert(c <= min_accel_offset); + } + + if (mode != NO_MATCHES && s >= accept_limit) { + if (mode == STOP_AT_MATCH) { + DEBUG_PRINTF("match - pausing\n"); + *state = s; + *c_final = c - 1; + return MO_MATCHES_PENDING; + } + + u64a loc = (c - 1) - buf + offAdj + 1; + if (single) { + DEBUG_PRINTF("reporting %u\n", m->arb_report); + if (cb(0, loc, m->arb_report, ctxt) == MO_HALT_MATCHING) { + return MO_DEAD; + } + } else if (doComplexReport64(cb, ctxt, m, s, loc, 0, + &cached_accept_state, + &cached_accept_id) + == MO_HALT_MATCHING) { + return MO_DEAD; + } + } + + assert(c <= c_end); /* sheng is fuzzy for min_accel_offset */ + } while (c < min_accel_offset); + + if (c == c_end) { + goto exit; + } + +with_accel: + do { + u32 accel_limit = m->accel_limit_8; + + assert(c < c_end); + if (!s) { + goto exit; + } else if (s < sheng_end) { + if (s > m->sheng_accel_limit) { + c = run_mcsheng_accel64(m, aux, s, &min_accel_offset, c, c_end); + if (c == c_end) { + goto exit; + } else { + goto without_accel; + } + } + s = doSheng64(m, &c, c_end, c_end, s, 1); + } else { + if (s >= accel_limit && aux[s].accel_offset) { + c = run_mcsheng_accel64(m, aux, s, &min_accel_offset, c, c_end); + if (c == c_end) { + goto exit; + } else { + goto without_accel; + } + } + s = doNormal64_8(m, &c, c_end, s, 1, mode); + } + + if (mode != NO_MATCHES && s >= accept_limit) { + if (mode == STOP_AT_MATCH) { + DEBUG_PRINTF("match - pausing\n"); + *state = s; + *c_final = c - 1; + return MO_MATCHES_PENDING; + } + + u64a loc = (c - 1) - buf + offAdj + 1; + if (single) { + DEBUG_PRINTF("reporting %u\n", m->arb_report); + if (cb(0, loc, m->arb_report, ctxt) == MO_HALT_MATCHING) { + return MO_DEAD; + } + } else if (doComplexReport64(cb, ctxt, m, s, loc, 0, + 
&cached_accept_state, + &cached_accept_id) + == MO_HALT_MATCHING) { + return MO_DEAD; + } + } + + assert(c <= c_end); + } while (c < c_end); + +exit: + *state = s; + if (mode == STOP_AT_MATCH) { + *c_final = c_end; + } + return MO_ALIVE; +} + +static never_inline +char mcsheng64Exec8_i_cb(const struct mcsheng64 *m, u32 *state, const u8 *buf, + size_t len, u64a offAdj, NfaCallback cb, void *ctxt, + char single, const u8 **final_point) { + return mcsheng64Exec8_i(m, state, buf, len, offAdj, cb, ctxt, single, + final_point, CALLBACK_OUTPUT); +} + +static never_inline +char mcsheng64Exec8_i_sam(const struct mcsheng64 *m, u32 *state, const u8 *buf, + size_t len, u64a offAdj, NfaCallback cb, void *ctxt, + char single, const u8 **final_point) { + return mcsheng64Exec8_i(m, state, buf, len, offAdj, cb, ctxt, single, + final_point, STOP_AT_MATCH); +} + +static never_inline +char mcsheng64Exec8_i_nm(const struct mcsheng64 *m, u32 *state, const u8 *buf, + size_t len, u64a offAdj, NfaCallback cb, void *ctxt, + char single, const u8 **final_point) { + return mcsheng64Exec8_i(m, state, buf, len, offAdj, cb, ctxt, single, + final_point, NO_MATCHES); +} + +static really_inline +char mcsheng64Exec8_i_ni(const struct mcsheng64 *m, u32 *state, const u8 *buf, + size_t len, u64a offAdj, NfaCallback cb, void *ctxt, + char single, const u8 **final_point, + enum MatchMode mode) { + if (mode == CALLBACK_OUTPUT) { + return mcsheng64Exec8_i_cb(m, state, buf, len, offAdj, cb, ctxt, single, + final_point); + } else if (mode == STOP_AT_MATCH) { + return mcsheng64Exec8_i_sam(m, state, buf, len, offAdj, cb, ctxt, + single, final_point); + } else { + assert(mode == NO_MATCHES); + return mcsheng64Exec8_i_nm(m, state, buf, len, offAdj, cb, ctxt, single, + final_point); + } +} + +static really_inline +char mcshengCheckEOD64(const struct NFA *nfa, u32 s, u64a offset, + NfaCallback cb, void *ctxt) { + const struct mcsheng64 *m = getImplNfa(nfa); + const struct mstate_aux *aux = get_aux64(m, s); + + if (!aux->accept_eod) { + return MO_CONTINUE_MATCHING; + } + return doComplexReport64(cb, ctxt, m, s, offset, 1, NULL, NULL); +} + +static really_inline +char nfaExecMcSheng64_16_Q2i(const struct NFA *n, u64a offset, const u8 *buffer, + const u8 *hend, NfaCallback cb, void *context, + struct mq *q, char single, s64a end, + enum MatchMode mode) { + assert(n->type == MCSHENG_64_NFA_16); + const struct mcsheng64 *m = getImplNfa(n); + s64a sp; + + assert(ISALIGNED_N(q->state, 2)); + u32 s = *(u16 *)q->state; + + if (q->report_current) { + assert(s); + assert(get_aux64(m, s)->accept); + + int rv; + if (single) { + DEBUG_PRINTF("reporting %u\n", m->arb_report); + rv = cb(0, q_cur_offset(q), m->arb_report, context); + } else { + u32 cached_accept_id = 0; + u32 cached_accept_state = 0; + + rv = doComplexReport64(cb, context, m, s, q_cur_offset(q), 0, + &cached_accept_state, &cached_accept_id); + } + + q->report_current = 0; + + if (rv == MO_HALT_MATCHING) { + return MO_DEAD; + } + } + + sp = q_cur_loc(q); + q->cur++; + + const u8 *cur_buf = sp < 0 ? 
hend : buffer; + + assert(q->cur); + if (mode != NO_MATCHES && q->items[q->cur - 1].location > end) { + DEBUG_PRINTF("this is as far as we go\n"); + q->cur--; + q->items[q->cur].type = MQE_START; + q->items[q->cur].location = end; + *(u16 *)q->state = s; + return MO_ALIVE; + } + + while (1) { + assert(q->cur < q->end); + s64a ep = q->items[q->cur].location; + if (mode != NO_MATCHES) { + ep = MIN(ep, end); + } + + assert(ep >= sp); + + s64a local_ep = ep; + if (sp < 0) { + local_ep = MIN(0, ep); + } + + /* do main buffer region */ + const u8 *final_look; + char rv = mcsheng64Exec16_i_ni(m, &s, cur_buf + sp, local_ep - sp, + offset + sp, cb, context, single, + &final_look, mode); + if (rv == MO_DEAD) { + *(u16 *)q->state = 0; + return MO_DEAD; + } + if (mode == STOP_AT_MATCH && rv == MO_MATCHES_PENDING) { + DEBUG_PRINTF("this is as far as we go\n"); + DEBUG_PRINTF("state %u final_look %zd\n", s, final_look - cur_buf); + + assert(q->cur); + assert(final_look != cur_buf + local_ep); + + q->cur--; + q->items[q->cur].type = MQE_START; + q->items[q->cur].location = final_look - cur_buf + 1; /* due to + * early -1 */ + *(u16 *)q->state = s; + return MO_MATCHES_PENDING; + } + + assert(rv == MO_ALIVE); + assert(q->cur); + if (mode != NO_MATCHES && q->items[q->cur].location > end) { + DEBUG_PRINTF("this is as far as we go\n"); + q->cur--; + q->items[q->cur].type = MQE_START; + q->items[q->cur].location = end; + *(u16 *)q->state = s; + return MO_ALIVE; + } + + sp = local_ep; + + if (sp == 0) { + cur_buf = buffer; + } + + if (sp != ep) { + continue; + } + + switch (q->items[q->cur].type) { + case MQE_TOP: + assert(sp + offset || !s); + if (sp + offset == 0) { + s = m->start_anchored; + break; + } + s = mcshengEnableStarts64(m, s); + break; + case MQE_END: + *(u16 *)q->state = s; + q->cur++; + return s ? MO_ALIVE : MO_DEAD; + default: + assert(!"invalid queue event"); + } + + q->cur++; + } +} + +static really_inline +char nfaExecMcSheng64_8_Q2i(const struct NFA *n, u64a offset, const u8 *buffer, + const u8 *hend, NfaCallback cb, void *context, + struct mq *q, char single, s64a end, + enum MatchMode mode) { + assert(n->type == MCSHENG_64_NFA_8); + const struct mcsheng64 *m = getImplNfa(n); + s64a sp; + + u32 s = *(u8 *)q->state; + + if (q->report_current) { + assert(s); + assert(s >= m->accept_limit_8); + + int rv; + if (single) { + DEBUG_PRINTF("reporting %u\n", m->arb_report); + + rv = cb(0, q_cur_offset(q), m->arb_report, context); + } else { + u32 cached_accept_id = 0; + u32 cached_accept_state = 0; + + rv = doComplexReport64(cb, context, m, s, q_cur_offset(q), 0, + &cached_accept_state, &cached_accept_id); + } + + q->report_current = 0; + + if (rv == MO_HALT_MATCHING) { + return MO_DEAD; + } + } + + sp = q_cur_loc(q); + q->cur++; + + const u8 *cur_buf = sp < 0 ? hend : buffer; + + if (mode != NO_MATCHES && q->items[q->cur - 1].location > end) { + DEBUG_PRINTF("this is as far as we go\n"); + q->cur--; + q->items[q->cur].type = MQE_START; + q->items[q->cur].location = end; + *(u8 *)q->state = s; + return MO_ALIVE; + } + + while (1) { + DEBUG_PRINTF("%s @ %llu\n", q->items[q->cur].type == MQE_TOP ? "TOP" : + q->items[q->cur].type == MQE_END ? 
"END" : "???", + q->items[q->cur].location + offset); + assert(q->cur < q->end); + s64a ep = q->items[q->cur].location; + if (mode != NO_MATCHES) { + ep = MIN(ep, end); + } + + assert(ep >= sp); + + s64a local_ep = ep; + if (sp < 0) { + local_ep = MIN(0, ep); + } + + const u8 *final_look; + char rv = mcsheng64Exec8_i_ni(m, &s, cur_buf + sp, local_ep - sp, + offset + sp, cb, context, single, + &final_look, mode); + if (rv == MO_HALT_MATCHING) { + *(u8 *)q->state = 0; + return MO_DEAD; + } + if (mode == STOP_AT_MATCH && rv == MO_MATCHES_PENDING) { + DEBUG_PRINTF("this is as far as we go\n"); + DEBUG_PRINTF("state %u final_look %zd\n", s, final_look - cur_buf); + + assert(q->cur); + assert(final_look != cur_buf + local_ep); + + q->cur--; + q->items[q->cur].type = MQE_START; + q->items[q->cur].location = final_look - cur_buf + 1; /* due to + * early -1 */ + *(u8 *)q->state = s; + return MO_MATCHES_PENDING; + } + + assert(rv == MO_ALIVE); + assert(q->cur); + if (mode != NO_MATCHES && q->items[q->cur].location > end) { + DEBUG_PRINTF("this is as far as we go\n"); + assert(q->cur); + q->cur--; + q->items[q->cur].type = MQE_START; + q->items[q->cur].location = end; + *(u8 *)q->state = s; + return MO_ALIVE; + } + + sp = local_ep; + + if (sp == 0) { + cur_buf = buffer; + } + + if (sp != ep) { + continue; + } + + switch (q->items[q->cur].type) { + case MQE_TOP: + assert(sp + offset || !s); + if (sp + offset == 0) { + s = (u8)m->start_anchored; + break; + } + s = mcshengEnableStarts64(m, s); + break; + case MQE_END: + *(u8 *)q->state = s; + q->cur++; + return s ? MO_ALIVE : MO_DEAD; + default: + assert(!"invalid queue event"); + } + + q->cur++; + } +} + +char nfaExecMcSheng64_8_Q(const struct NFA *n, struct mq *q, s64a end) { + u64a offset = q->offset; + const u8 *buffer = q->buffer; + NfaCallback cb = q->cb; + void *context = q->context; + assert(n->type == MCSHENG_64_NFA_8); + const struct mcsheng64 *m = getImplNfa(n); + const u8 *hend = q->history + q->hlength; + + return nfaExecMcSheng64_8_Q2i(n, offset, buffer, hend, cb, context, q, + m->flags & MCSHENG_FLAG_SINGLE, end, + CALLBACK_OUTPUT); +} + +char nfaExecMcSheng64_16_Q(const struct NFA *n, struct mq *q, s64a end) { + u64a offset = q->offset; + const u8 *buffer = q->buffer; + NfaCallback cb = q->cb; + void *context = q->context; + assert(n->type == MCSHENG_64_NFA_16); + const struct mcsheng64 *m = getImplNfa(n); + const u8 *hend = q->history + q->hlength; + + return nfaExecMcSheng64_16_Q2i(n, offset, buffer, hend, cb, context, q, + m->flags & MCSHENG_FLAG_SINGLE, end, + CALLBACK_OUTPUT); +} + +char nfaExecMcSheng64_8_reportCurrent(const struct NFA *n, struct mq *q) { + const struct mcsheng64 *m = getImplNfa(n); + NfaCallback cb = q->cb; + void *ctxt = q->context; + u32 s = *(u8 *)q->state; + u8 single = m->flags & MCSHENG_FLAG_SINGLE; + u64a offset = q_cur_offset(q); + assert(q_cur_type(q) == MQE_START); + assert(s); + + if (s >= m->accept_limit_8) { + if (single) { + DEBUG_PRINTF("reporting %u\n", m->arb_report); + cb(0, offset, m->arb_report, ctxt); + } else { + u32 cached_accept_id = 0; + u32 cached_accept_state = 0; + + doComplexReport64(cb, ctxt, m, s, offset, 0, &cached_accept_state, + &cached_accept_id); + } + } + + return 0; +} + +char nfaExecMcSheng64_16_reportCurrent(const struct NFA *n, struct mq *q) { + const struct mcsheng64 *m = getImplNfa(n); + NfaCallback cb = q->cb; + void *ctxt = q->context; + u32 s = *(u16 *)q->state; + const struct mstate_aux *aux = get_aux64(m, s); + u8 single = m->flags & MCSHENG_FLAG_SINGLE; + u64a 
offset = q_cur_offset(q); + assert(q_cur_type(q) == MQE_START); + DEBUG_PRINTF("state %u\n", s); + assert(s); + + if (aux->accept) { + if (single) { + DEBUG_PRINTF("reporting %u\n", m->arb_report); + cb(0, offset, m->arb_report, ctxt); + } else { + u32 cached_accept_id = 0; + u32 cached_accept_state = 0; + + doComplexReport64(cb, ctxt, m, s, offset, 0, &cached_accept_state, + &cached_accept_id); + } + } + + return 0; +} + +static +char mcshengHasAccept64(const struct mcsheng64 *m, const struct mstate_aux *aux, + ReportID report) { + assert(m && aux); + + if (!aux->accept) { + return 0; + } + + const struct report_list *rl = (const struct report_list *) + ((const char *)m + aux->accept - sizeof(struct NFA)); + assert(ISALIGNED_N(rl, 4)); + + DEBUG_PRINTF("report list has %u entries\n", rl->count); + + for (u32 i = 0; i < rl->count; i++) { + if (rl->report[i] == report) { + return 1; + } + } + + return 0; +} + +char nfaExecMcSheng64_8_inAccept(const struct NFA *n, ReportID report, + struct mq *q) { + assert(n && q); + + const struct mcsheng64 *m = getImplNfa(n); + u8 s = *(u8 *)q->state; + DEBUG_PRINTF("checking accepts for %hhu\n", s); + + return mcshengHasAccept64(m, get_aux64(m, s), report); +} + +char nfaExecMcSheng64_8_inAnyAccept(const struct NFA *n, struct mq *q) { + assert(n && q); + + const struct mcsheng64 *m = getImplNfa(n); + u8 s = *(u8 *)q->state; + DEBUG_PRINTF("checking accepts for %hhu\n", s); + + return !!get_aux64(m, s)->accept; +} + +char nfaExecMcSheng64_16_inAccept(const struct NFA *n, ReportID report, + struct mq *q) { + assert(n && q); + + const struct mcsheng64 *m = getImplNfa(n); + u16 s = *(u16 *)q->state; + DEBUG_PRINTF("checking accepts for %hu\n", s); + + return mcshengHasAccept64(m, get_aux64(m, s), report); +} + +char nfaExecMcSheng64_16_inAnyAccept(const struct NFA *n, struct mq *q) { + assert(n && q); + + const struct mcsheng64 *m = getImplNfa(n); + u16 s = *(u16 *)q->state; + DEBUG_PRINTF("checking accepts for %hu\n", s); + + return !!get_aux64(m, s)->accept; +} + +char nfaExecMcSheng64_8_Q2(const struct NFA *n, struct mq *q, s64a end) { + u64a offset = q->offset; + const u8 *buffer = q->buffer; + NfaCallback cb = q->cb; + void *context = q->context; + assert(n->type == MCSHENG_64_NFA_8); + const struct mcsheng64 *m = getImplNfa(n); + const u8 *hend = q->history + q->hlength; + + return nfaExecMcSheng64_8_Q2i(n, offset, buffer, hend, cb, context, q, + m->flags & MCSHENG_FLAG_SINGLE, end, + STOP_AT_MATCH); +} + +char nfaExecMcSheng64_16_Q2(const struct NFA *n, struct mq *q, s64a end) { + u64a offset = q->offset; + const u8 *buffer = q->buffer; + NfaCallback cb = q->cb; + void *context = q->context; + assert(n->type == MCSHENG_64_NFA_16); + const struct mcsheng64 *m = getImplNfa(n); + const u8 *hend = q->history + q->hlength; + + return nfaExecMcSheng64_16_Q2i(n, offset, buffer, hend, cb, context, q, + m->flags & MCSHENG_FLAG_SINGLE, end, + STOP_AT_MATCH); +} + +char nfaExecMcSheng64_8_QR(const struct NFA *n, struct mq *q, ReportID report) { + u64a offset = q->offset; + const u8 *buffer = q->buffer; + NfaCallback cb = q->cb; + void *context = q->context; + assert(n->type == MCSHENG_64_NFA_8); + const struct mcsheng64 *m = getImplNfa(n); + const u8 *hend = q->history + q->hlength; + + char rv = nfaExecMcSheng64_8_Q2i(n, offset, buffer, hend, cb, context, q, + m->flags & MCSHENG_FLAG_SINGLE, + 0 /* end */, NO_MATCHES); + if (rv && nfaExecMcSheng64_8_inAccept(n, report, q)) { + return MO_MATCHES_PENDING; + } else { + return rv; + } +} + +char 
nfaExecMcSheng64_16_QR(const struct NFA *n, struct mq *q, ReportID report) { + u64a offset = q->offset; + const u8 *buffer = q->buffer; + NfaCallback cb = q->cb; + void *context = q->context; + assert(n->type == MCSHENG_64_NFA_16); + const struct mcsheng64 *m = getImplNfa(n); + const u8 *hend = q->history + q->hlength; + + char rv = nfaExecMcSheng64_16_Q2i(n, offset, buffer, hend, cb, context, q, + m->flags & MCSHENG_FLAG_SINGLE, + 0 /* end */, NO_MATCHES); + + if (rv && nfaExecMcSheng64_16_inAccept(n, report, q)) { + return MO_MATCHES_PENDING; + } else { + return rv; + } +} + +char nfaExecMcSheng64_8_initCompressedState(const struct NFA *nfa, u64a offset, + void *state, UNUSED u8 key) { + const struct mcsheng64 *m = getImplNfa(nfa); + u8 s = offset ? m->start_floating : m->start_anchored; + if (s) { + *(u8 *)state = s; + return 1; + } + return 0; +} + +char nfaExecMcSheng64_16_initCompressedState(const struct NFA *nfa, u64a offset, + void *state, UNUSED u8 key) { + const struct mcsheng64 *m = getImplNfa(nfa); + u16 s = offset ? m->start_floating : m->start_anchored; + if (s) { + unaligned_store_u16(state, s); + return 1; + } + return 0; +} + +char nfaExecMcSheng64_8_testEOD(const struct NFA *nfa, const char *state, + UNUSED const char *streamState, u64a offset, + NfaCallback callback, void *context) { + return mcshengCheckEOD64(nfa, *(const u8 *)state, offset, callback, + context); +} + +char nfaExecMcSheng64_16_testEOD(const struct NFA *nfa, const char *state, + UNUSED const char *streamState, u64a offset, + NfaCallback callback, void *context) { + assert(ISALIGNED_N(state, 2)); + return mcshengCheckEOD64(nfa, *(const u16 *)state, offset, callback, + context); +} + +char nfaExecMcSheng64_8_queueInitState(UNUSED const struct NFA *nfa, struct mq *q) { + assert(nfa->scratchStateSize == 1); + *(u8 *)q->state = 0; + return 0; +} + +char nfaExecMcSheng64_16_queueInitState(UNUSED const struct NFA *nfa, struct mq *q) { + assert(nfa->scratchStateSize == 2); + assert(ISALIGNED_N(q->state, 2)); + *(u16 *)q->state = 0; + return 0; +} + +char nfaExecMcSheng64_8_queueCompressState(UNUSED const struct NFA *nfa, + const struct mq *q, UNUSED s64a loc) { + void *dest = q->streamState; + const void *src = q->state; + assert(nfa->scratchStateSize == 1); + assert(nfa->streamStateSize == 1); + *(u8 *)dest = *(const u8 *)src; + return 0; +} + +char nfaExecMcSheng64_8_expandState(UNUSED const struct NFA *nfa, void *dest, + const void *src, UNUSED u64a offset, + UNUSED u8 key) { + assert(nfa->scratchStateSize == 1); + assert(nfa->streamStateSize == 1); + *(u8 *)dest = *(const u8 *)src; + return 0; +} + +char nfaExecMcSheng64_16_queueCompressState(UNUSED const struct NFA *nfa, + const struct mq *q, + UNUSED s64a loc) { + void *dest = q->streamState; + const void *src = q->state; + assert(nfa->scratchStateSize == 2); + assert(nfa->streamStateSize == 2); + assert(ISALIGNED_N(src, 2)); + unaligned_store_u16(dest, *(const u16 *)(src)); + return 0; +} + +char nfaExecMcSheng64_16_expandState(UNUSED const struct NFA *nfa, void *dest, + const void *src, UNUSED u64a offset, + UNUSED u8 key) { + assert(nfa->scratchStateSize == 2); + assert(nfa->streamStateSize == 2); + assert(ISALIGNED_N(dest, 2)); + *(u16 *)dest = unaligned_load_u16(src); + return 0; +} +#endif diff --git a/regex/nfa/mcsheng.h b/regex/nfa/mcsheng.h new file mode 100644 index 000000000..0329e1212 --- /dev/null +++ b/regex/nfa/mcsheng.h @@ -0,0 +1,157 @@ +/* + * Copyright (c) 2016-2020, Intel Corporation + * + * Redistribution and use in source and binary 
forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef MCSHENG_H +#define MCSHENG_H + +#include "callback.h" +#include "ue2common.h" + +struct mq; +struct NFA; + +/* 8-bit Sheng-McClellan hybrid */ + +char nfaExecMcSheng8_testEOD(const struct NFA *nfa, const char *state, + const char *streamState, u64a offset, + NfaCallback callback, void *context); +char nfaExecMcSheng8_Q(const struct NFA *n, struct mq *q, s64a end); +char nfaExecMcSheng8_Q2(const struct NFA *n, struct mq *q, s64a end); +char nfaExecMcSheng8_QR(const struct NFA *n, struct mq *q, ReportID report); +char nfaExecMcSheng8_reportCurrent(const struct NFA *n, struct mq *q); +char nfaExecMcSheng8_inAccept(const struct NFA *n, ReportID report, + struct mq *q); +char nfaExecMcSheng8_inAnyAccept(const struct NFA *n, struct mq *q); +char nfaExecMcSheng8_queueInitState(const struct NFA *n, struct mq *q); +char nfaExecMcSheng8_initCompressedState(const struct NFA *n, u64a offset, + void *state, u8 key); +char nfaExecMcSheng8_queueCompressState(const struct NFA *nfa, + const struct mq *q, s64a loc); +char nfaExecMcSheng8_expandState(const struct NFA *nfa, void *dest, + const void *src, u64a offset, u8 key); + +#define nfaExecMcSheng8_B_Reverse NFA_API_NO_IMPL +#define nfaExecMcSheng8_zombie_status NFA_API_ZOMBIE_NO_IMPL + +/* 16-bit Sheng-McClellan hybrid */ + +char nfaExecMcSheng16_testEOD(const struct NFA *nfa, const char *state, + const char *streamState, u64a offset, + NfaCallback callback, void *context); +char nfaExecMcSheng16_Q(const struct NFA *n, struct mq *q, s64a end); +char nfaExecMcSheng16_Q2(const struct NFA *n, struct mq *q, s64a end); +char nfaExecMcSheng16_QR(const struct NFA *n, struct mq *q, ReportID report); +char nfaExecMcSheng16_reportCurrent(const struct NFA *n, struct mq *q); +char nfaExecMcSheng16_inAccept(const struct NFA *n, ReportID report, + struct mq *q); +char nfaExecMcSheng16_inAnyAccept(const struct NFA *n, struct mq *q); +char nfaExecMcSheng16_queueInitState(const struct NFA *n, struct mq *q); +char nfaExecMcSheng16_initCompressedState(const struct NFA *n, u64a offset, + void *state, u8 key); +char 
nfaExecMcSheng16_queueCompressState(const struct NFA *nfa, + const struct mq *q, s64a loc); +char nfaExecMcSheng16_expandState(const struct NFA *nfa, void *dest, + const void *src, u64a offset, u8 key); + +#define nfaExecMcSheng16_B_Reverse NFA_API_NO_IMPL +#define nfaExecMcSheng16_zombie_status NFA_API_ZOMBIE_NO_IMPL +#if defined(HAVE_AVX512VBMI) +/* 64-8 bit Sheng-McClellan hybrid */ +char nfaExecMcSheng64_8_testEOD(const struct NFA *nfa, const char *state, + const char *streamState, u64a offset, + NfaCallback callback, void *context); +char nfaExecMcSheng64_8_Q(const struct NFA *n, struct mq *q, s64a end); +char nfaExecMcSheng64_8_Q2(const struct NFA *n, struct mq *q, s64a end); +char nfaExecMcSheng64_8_QR(const struct NFA *n, struct mq *q, ReportID report); +char nfaExecMcSheng64_8_reportCurrent(const struct NFA *n, struct mq *q); +char nfaExecMcSheng64_8_inAccept(const struct NFA *n, ReportID report, + struct mq *q); +char nfaExecMcSheng64_8_inAnyAccept(const struct NFA *n, struct mq *q); +char nfaExecMcSheng64_8_queueInitState(const struct NFA *n, struct mq *q); +char nfaExecMcSheng64_8_initCompressedState(const struct NFA *n, u64a offset, + void *state, u8 key); +char nfaExecMcSheng64_8_queueCompressState(const struct NFA *nfa, + const struct mq *q, s64a loc); +char nfaExecMcSheng64_8_expandState(const struct NFA *nfa, void *dest, + const void *src, u64a offset, u8 key); + +#define nfaExecMcSheng64_8_B_Reverse NFA_API_NO_IMPL +#define nfaExecMcSheng64_8_zombie_status NFA_API_ZOMBIE_NO_IMPL + +/* 64-16 bit Sheng-McClellan hybrid */ +char nfaExecMcSheng64_16_testEOD(const struct NFA *nfa, const char *state, + const char *streamState, u64a offset, + NfaCallback callback, void *context); +char nfaExecMcSheng64_16_Q(const struct NFA *n, struct mq *q, s64a end); +char nfaExecMcSheng64_16_Q2(const struct NFA *n, struct mq *q, s64a end); +char nfaExecMcSheng64_16_QR(const struct NFA *n, struct mq *q, ReportID report); +char nfaExecMcSheng64_16_reportCurrent(const struct NFA *n, struct mq *q); +char nfaExecMcSheng64_16_inAccept(const struct NFA *n, ReportID report, + struct mq *q); +char nfaExecMcSheng64_16_inAnyAccept(const struct NFA *n, struct mq *q); +char nfaExecMcSheng64_16_queueInitState(const struct NFA *n, struct mq *q); +char nfaExecMcSheng64_16_initCompressedState(const struct NFA *n, u64a offset, + void *state, u8 key); +char nfaExecMcSheng64_16_queueCompressState(const struct NFA *nfa, + const struct mq *q, s64a loc); +char nfaExecMcSheng64_16_expandState(const struct NFA *nfa, void *dest, + const void *src, u64a offset, u8 key); +#define nfaExecMcSheng64_16_B_Reverse NFA_API_NO_IMPL +#define nfaExecMcSheng64_16_zombie_status NFA_API_ZOMBIE_NO_IMPL +#else // !HAVE_AVX512VBMI +#define nfaExecMcSheng64_8_B_Reverse NFA_API_NO_IMPL +#define nfaExecMcSheng64_8_zombie_status NFA_API_ZOMBIE_NO_IMPL +#define nfaExecMcSheng64_8_Q NFA_API_NO_IMPL +#define nfaExecMcSheng64_8_Q2 NFA_API_NO_IMPL +#define nfaExecMcSheng64_8_QR NFA_API_NO_IMPL +#define nfaExecMcSheng64_8_inAccept NFA_API_NO_IMPL +#define nfaExecMcSheng64_8_inAnyAccept NFA_API_NO_IMPL +#define nfaExecMcSheng64_8_queueInitState NFA_API_NO_IMPL +#define nfaExecMcSheng64_8_queueCompressState NFA_API_NO_IMPL +#define nfaExecMcSheng64_8_expandState NFA_API_NO_IMPL +#define nfaExecMcSheng64_8_initCompressedState NFA_API_NO_IMPL +#define nfaExecMcSheng64_8_testEOD NFA_API_NO_IMPL +#define nfaExecMcSheng64_8_reportCurrent NFA_API_NO_IMPL + +#define nfaExecMcSheng64_16_B_Reverse NFA_API_NO_IMPL +#define nfaExecMcSheng64_16_zombie_status 
NFA_API_ZOMBIE_NO_IMPL +#define nfaExecMcSheng64_16_Q NFA_API_NO_IMPL +#define nfaExecMcSheng64_16_Q2 NFA_API_NO_IMPL +#define nfaExecMcSheng64_16_QR NFA_API_NO_IMPL +#define nfaExecMcSheng64_16_inAccept NFA_API_NO_IMPL +#define nfaExecMcSheng64_16_inAnyAccept NFA_API_NO_IMPL +#define nfaExecMcSheng64_16_queueInitState NFA_API_NO_IMPL +#define nfaExecMcSheng64_16_queueCompressState NFA_API_NO_IMPL +#define nfaExecMcSheng64_16_expandState NFA_API_NO_IMPL +#define nfaExecMcSheng64_16_initCompressedState NFA_API_NO_IMPL +#define nfaExecMcSheng64_16_testEOD NFA_API_NO_IMPL +#define nfaExecMcSheng64_16_reportCurrent NFA_API_NO_IMPL + +#endif //end of HAVE_AVX512VBMI + +#endif diff --git a/regex/nfa/mcsheng_data.c b/regex/nfa/mcsheng_data.c new file mode 100644 index 000000000..0701b4b31 --- /dev/null +++ b/regex/nfa/mcsheng_data.c @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2016-2020, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "mcsheng_internal.h" + +/* This table is in a separate translation unit from mcsheng.c as we want to + * prevent the compiler from seeing these constants. We have the load resources + * free at runtime to load the masks with no problems. 
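+ * Mask i keeps the eight bits of byte i of the input u64a plus a handful of
+ * cleared low bits, so, with the low byte of the data cleared, pext64() yields
+ * byte i pre-multiplied by the shuffle-mask stride: << 4 for the 16-byte masks
+ * used with this table, and << 6 for the 64-byte masks used with
+ * mcsheng64_pext_mask below (see the asserts in SHENG64_SINGLE_UNROLL_ITER).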
*/ +const u64a mcsheng_pext_mask[8] = { + 0, /* dummy */ + 0x000000000000ff0f, + 0x0000000000ff000f, + 0x00000000ff00000f, + 0x000000ff0000000f, + 0x0000ff000000000f, + 0x00ff00000000000f, + 0xff0000000000000f, +}; +#if defined(HAVE_AVX512VBMI) +const u64a mcsheng64_pext_mask[8] = { + 0, /* dummy */ + 0x000000000000ff3f, + 0x0000000000ff003f, + 0x00000000ff00003f, + 0x000000ff0000003f, + 0x0000ff000000003f, + 0x00ff00000000003f, + 0xff0000000000003f, +}; +#endif diff --git a/regex/nfa/mcsheng_internal.h b/regex/nfa/mcsheng_internal.h new file mode 100644 index 000000000..d98557462 --- /dev/null +++ b/regex/nfa/mcsheng_internal.h @@ -0,0 +1,124 @@ +/* + * Copyright (c) 2016-2020, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef MCSHENG_INTERNAL_H +#define MCSHENG_INTERNAL_H + +#include "nfa_internal.h" +#include "ue2common.h" +#include "util/simd_types.h" + +#define ACCEPT_FLAG 0x8000 +#define ACCEL_FLAG 0x4000 +#define STATE_MASK 0x3fff + +#define SHERMAN_STATE 1 + +#define SHERMAN_TYPE_OFFSET 0 +#define SHERMAN_FIXED_SIZE 32 + +#define SHERMAN_LEN_OFFSET 1 +#define SHERMAN_DADDY_OFFSET 2 +#define SHERMAN_CHARS_OFFSET 4 +#define SHERMAN_STATES_OFFSET(sso_len) (4 + (sso_len)) + +struct report_list { + u32 count; + ReportID report[]; +}; + +struct mstate_aux { + u32 accept; + u32 accept_eod; + u16 top; + u32 accel_offset; /* relative to start of struct mcsheng; 0 if no accel */ +}; + +#define MCSHENG_FLAG_SINGLE 1 /**< we raise only single accept id */ + +struct mcsheng { + u16 state_count; /**< total number of states */ + u32 length; /**< length of dfa in bytes */ + u16 start_anchored; /**< anchored start state */ + u16 start_floating; /**< floating start state */ + u32 aux_offset; /**< offset of the aux structures relative to the start of + * the nfa structure */ + u32 sherman_offset; /**< offset of array of sherman state offsets the + * state_info structures relative to the start of the + * nfa structure */ + u32 sherman_end; /**< offset of the end of the state_info structures + * relative to the start of the nfa structure */ + u16 sheng_end; /**< first non-sheng state */ + u16 sheng_accel_limit; /**< first sheng accel state. state given in terms of + * internal sheng ids */ + u16 accel_limit_8; /**< 8 bit, lowest accelerable state */ + u16 accept_limit_8; /**< 8 bit, lowest accept state */ + u16 sherman_limit; /**< lowest sherman state */ + u8 alphaShift; + u8 flags; + u8 has_accel; /**< 1 iff there are any accel plans */ + u8 remap[256]; /**< remaps characters to a smaller alphabet */ + ReportID arb_report; /**< one of the accepts that this dfa may raise */ + u32 accel_offset; /**< offset of accel structures from start of McClellan */ + m128 sheng_masks[N_CHARS]; +}; + +/* pext masks for the runtime to access appropriately copies of bytes 1..7 + * representing the data from a u64a. */ +extern const u64a mcsheng_pext_mask[8]; + +struct mcsheng64 { + u16 state_count; /**< total number of states */ + u32 length; /**< length of dfa in bytes */ + u16 start_anchored; /**< anchored start state */ + u16 start_floating; /**< floating start state */ + u32 aux_offset; /**< offset of the aux structures relative to the start of + * the nfa structure */ + u32 sherman_offset; /**< offset of array of sherman state offsets the + * state_info structures relative to the start of the + * nfa structure */ + u32 sherman_end; /**< offset of the end of the state_info structures + * relative to the start of the nfa structure */ + u16 sheng_end; /**< first non-sheng state */ + u16 sheng_accel_limit; /**< first sheng accel state. 
state given in terms of + * internal sheng ids */ + u16 accel_limit_8; /**< 8 bit, lowest accelerable state */ + u16 accept_limit_8; /**< 8 bit, lowest accept state */ + u16 sherman_limit; /**< lowest sherman state */ + u8 alphaShift; + u8 flags; + u8 has_accel; /**< 1 iff there are any accel plans */ + u8 remap[256]; /**< remaps characters to a smaller alphabet */ + ReportID arb_report; /**< one of the accepts that this dfa may raise */ + u32 accel_offset; /**< offset of accel structures from start of McClellan */ + m512 sheng_succ_masks[N_CHARS]; +}; + +extern const u64a mcsheng64_pext_mask[8]; + +#endif diff --git a/regex/nfa/mpv.c b/regex/nfa/mpv.c new file mode 100644 index 000000000..d03009b20 --- /dev/null +++ b/regex/nfa/mpv.c @@ -0,0 +1,1100 @@ +/* + * Copyright (c) 2015-2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "mpv.h" + +#include "mpv_internal.h" +#include "nfa_api.h" +#include "nfa_api_queue.h" +#include "nfa_internal.h" +#include "shufti.h" +#include "truffle.h" +#include "ue2common.h" +#include "vermicelli.h" +#include "vermicelli_run.h" +#include "util/multibit.h" +#include "util/partial_store.h" +#include "util/simd_utils.h" +#include "util/unaligned.h" + +#ifndef __KERNEL__ +#include +#else +#include +#endif + +#define MIN_SKIP_REPEAT 32 + +typedef struct mpv_pq_item PQ_T; +#define PQ_COMP(pqc_items, a, b) \ + ((pqc_items)[a].trigger_loc < (pqc_items)[b].trigger_loc) +#define PQ_COMP_B(pqc_items, a, b_fixed) \ + ((pqc_items)[a].trigger_loc < (b_fixed).trigger_loc) + +#include "util/pqueue.h" + +static really_inline +u64a *get_counter_n(struct mpv_decomp_state *s, + const struct mpv *m, u32 n) { + return (u64a *)((char *)s + get_counter_info(m)[n].counter_offset); +} + +static really_inline +u64a *get_counter_for_kilo(struct mpv_decomp_state *s, + const struct mpv_kilopuff *kp) { + return (u64a *)((char *)s + kp->counter_offset); +} + +static really_inline +u64a get_counter_value_for_kilo(struct mpv_decomp_state *s, + const struct mpv_kilopuff *kp) { + return *get_counter_for_kilo(s, kp) + s->counter_adj; +} + +static really_inline +const u64a *get_counter_for_kilo_c(const struct mpv_decomp_state *s, + const struct mpv_kilopuff *kp) { + return (const u64a *)((const char *)s + kp->counter_offset); +} + + +static never_inline +void normalize_counters(struct mpv_decomp_state *dstate, const struct mpv *m) { + u64a adj = dstate->counter_adj; + u64a *counters = get_counter_n(dstate, m, 0); + + if (!adj) { + return; + } + + for (u32 i = 0; i < m->counter_count; i++) { + /* update all counters - alive or dead */ + counters[i] += adj; + DEBUG_PRINTF("counter %u: %llu\n", i, counters[i]); + } + + dstate->counter_adj = 0; +} + +static really_inline +char processReports(const struct mpv *m, u8 *reporters, + const struct mpv_decomp_state *dstate, u64a counter_adj, + u64a report_offset, NfaCallback cb, void *ctxt, + ReportID *rl, u32 *rl_count_out) { + DEBUG_PRINTF("reporting at offset %llu\n", report_offset); + const struct mpv_kilopuff *kp = (const void *)(m + 1); + u32 rl_count = 0; + + for (u32 i = mmbit_iterate(reporters, m->kilo_count, MMB_INVALID); + i != MMB_INVALID; i = mmbit_iterate(reporters, m->kilo_count, i)) { + const struct mpv_puffette *curr = dstate->active[i].curr; + u64a curr_counter_val = *get_counter_for_kilo_c(dstate, &kp[i]) + + counter_adj; + DEBUG_PRINTF("kilo %u, underlying counter: %llu current: %llu\n", i, + *get_counter_for_kilo_c(dstate, &kp[i]), curr_counter_val); + assert(curr_counter_val != MPV_DEAD_VALUE); /* counter_adj should take + * care if underlying value + * is -1 */ + char did_stuff = 0; + + while (curr->report != INVALID_REPORT) { + assert(curr_counter_val >= curr->repeats); + if (curr->unbounded || curr_counter_val == curr->repeats) { + DEBUG_PRINTF("report %u at %llu\n", curr->report, + report_offset); + + if (curr->unbounded && !curr->simple_exhaust) { + assert(rl_count < m->puffette_count); + *rl = curr->report; + ++rl; + rl_count++; + } + + if (cb(0, report_offset, curr->report, ctxt) == + MO_HALT_MATCHING) { + DEBUG_PRINTF("bailing\n"); + return MO_HALT_MATCHING; + } + did_stuff = 1; + } + + curr--; + } + + if (!did_stuff) { + mmbit_unset(reporters, m->kilo_count, i); + } + } + + *rl_count_out = rl_count; + return MO_CONTINUE_MATCHING; +} + +static +ReportID *get_report_list(const struct mpv *m, struct mpv_decomp_state *s) { + return 
(ReportID *)((char *)s + m->report_list_offset); +} + +static really_inline +char processReportsForRange(const struct mpv *m, u8 *reporters, + struct mpv_decomp_state *dstate, u64a first_offset, + size_t length, NfaCallback cb, void *ctxt) { + if (!length) { + return MO_CONTINUE_MATCHING; + } + + u64a counter_adj = dstate->counter_adj; + u32 rl_count = 0; + ReportID *rl = get_report_list(m, dstate); + char rv = processReports(m, reporters, dstate, 1 + counter_adj, + first_offset + 1, cb, ctxt, rl, &rl_count); + if (rv != MO_CONTINUE_MATCHING) { + DEBUG_PRINTF("bailing\n"); + return rv; + } + if (!rl_count) { + return MO_CONTINUE_MATCHING; + } + + DEBUG_PRINTF("length=%zu, rl_count=%u\n", length, rl_count); + + for (size_t i = 2; i <= length; i++) { + for (u32 j = 0; j < rl_count; j++) { + if (cb(0, first_offset + i, rl[j], ctxt) == MO_HALT_MATCHING) { + DEBUG_PRINTF("bailing\n"); + return MO_HALT_MATCHING; + } + } + } + + return MO_CONTINUE_MATCHING; +} + +/* returns last puffette that we have satisfied */ +static +const struct mpv_puffette *get_curr_puff(const struct mpv *m, + const struct mpv_kilopuff *kp, + struct mpv_decomp_state *dstate) { + u64a counter = *get_counter_for_kilo(dstate, kp); + assert(counter != MPV_DEAD_VALUE); + + const struct mpv_puffette *p = get_puff_array(m, kp); + DEBUG_PRINTF("looking for current puffette (counter = %llu)\n", counter); + DEBUG_PRINTF("next: (%u, %u)\n", p->repeats, p->report); + while (counter + 1 >= p->repeats && p->report != INVALID_REPORT) { + DEBUG_PRINTF("advancing\n"); + ++p; + DEBUG_PRINTF("next: (%u, %u)\n", p->repeats, p->report); + } + + return p - 1; +} + +static +const struct mpv_puffette *get_init_puff(const struct mpv *m, + const struct mpv_kilopuff *kp) { + const struct mpv_puffette *p = get_puff_array(m, kp); + while (p->repeats == 1) { + ++p; + } + return p - 1; +} + + +/* returns the last puffette whose repeats have been satisfied */ +static really_inline +const struct mpv_puffette *update_curr_puff(const struct mpv *m, u8 *reporters, + u64a counter, + const struct mpv_puffette *in, + u32 kilo_index) { + assert(counter != MPV_DEAD_VALUE); + + const struct mpv_puffette *p = in; + DEBUG_PRINTF("looking for current puffette (counter = %llu)\n", counter); + DEBUG_PRINTF("curr: (%u, %u)\n", p->repeats, p->report); + while (counter + 1 >= p[1].repeats && p[1].report != INVALID_REPORT) { + DEBUG_PRINTF("advancing\n"); + ++p; + DEBUG_PRINTF("curr: (%u, %u)\n", p->repeats, p->report); + } + + if (p != in) { + mmbit_set(reporters, m->kilo_count, kilo_index); + } + + return p; +} + +static really_inline +size_t limitByReach(const struct mpv_kilopuff *kp, const u8 *buf, + size_t length) { + if (kp->type == MPV_VERM) { + return vermicelliExec(kp->u.verm.c, 0, buf, buf + length) - buf; + } else if (kp->type == MPV_SHUFTI) { + m128 mask_lo = kp->u.shuf.mask_lo; + m128 mask_hi = kp->u.shuf.mask_hi; + return shuftiExec(mask_lo, mask_hi, buf, buf + length) - buf; + } else if (kp->type == MPV_TRUFFLE) { + return truffleExec(kp->u.truffle.mask1, kp->u.truffle.mask2, buf, buf + length) - buf; + } else if (kp->type == MPV_NVERM) { + return nvermicelliExec(kp->u.verm.c, 0, buf, buf + length) - buf; + } + + assert(kp->type == MPV_DOT); + return length; +} + +static never_inline +void fillLimits(const struct mpv *m, u8 *active, u8 *reporters, + struct mpv_decomp_state *dstate, struct mpv_pq_item *pq, + const u8 *buf, size_t length) { + DEBUG_PRINTF("filling limits %zu\n", length); + assert(!dstate->pq_size); + + if (!length) { + DEBUG_PRINTF("0 
length\n"); + return; + } + + const struct mpv_kilopuff *kp = (const void *)(m + 1); + + for (u32 i = mmbit_iterate(active, m->kilo_count, MMB_INVALID); + i != MMB_INVALID; i = mmbit_iterate(active, m->kilo_count, i)) { + dstate->active[i].curr = get_curr_puff(m, &kp[i], dstate); + if (dstate->active[i].curr->report != INVALID_REPORT) { + /* this kilo puff may fire reports */ + mmbit_set(reporters, m->kilo_count, i); + } + + u64a lim = limitByReach(&kp[i], buf, length); + DEBUG_PRINTF("lim %llu/%zu\n", lim, length); + + if (kp[i].dead_point != MPV_DEAD_VALUE) { + assert(!kp[i].auto_restart); + u64a counter = get_counter_value_for_kilo(dstate, &kp[i]); + u64a dp_trigger = kp[i].dead_point - counter; + if (dp_trigger < lim) { + DEBUG_PRINTF("dead point trigger %llu\n", dp_trigger); + lim = dp_trigger; + } + } + + if (kp[i].auto_restart && !lim) { + *get_counter_for_kilo(dstate, &kp[i]) = MPV_DEAD_VALUE; + mmbit_unset(reporters, m->kilo_count, i); + /* the counter value will cause the nex_trigger calculation below to + * adjust correctly */ + if (length == 1) { + dstate->active[i].limit = 0; + continue; + } + + lim = limitByReach(&kp[i], buf + 1, length - 1) + 1; + + + /* restart active counters */ + dstate->active[i].curr = get_init_puff(m, &kp[i]); + assert(dstate->active[i].curr[0].report == INVALID_REPORT); + + DEBUG_PRINTF("lim now %llu/%zu\n", lim, length); + } + + dstate->active[i].limit = lim; + if (!lim) { + mmbit_unset(active, m->kilo_count, i); + mmbit_unset(reporters, m->kilo_count, i); + continue; + } + if (dstate->active[i].curr[1].report != INVALID_REPORT) { + u32 next_trigger = dstate->active[i].curr[1].repeats - 1ULL + - *get_counter_for_kilo(dstate, &kp[i]); + DEBUG_PRINTF("next trigger %u\n", next_trigger); + lim = MIN(lim, next_trigger); + } + + if (lim != length) { + struct mpv_pq_item temp = { + .trigger_loc = lim, + .kilo = i + }; + + DEBUG_PRINTF("push for %u at %llu\n", i, lim); + pq_insert(pq, dstate->pq_size, temp); + ++dstate->pq_size; + } + + assert(lim || kp[i].auto_restart); + } + + DEBUG_PRINTF("filled\n"); + dstate->filled = 1; +} + +static never_inline +void handleTopN(const struct mpv *m, s64a loc, u8 *active, u8 *reporters, + struct mpv_decomp_state *dstate, struct mpv_pq_item *pq, + const u8 *buf, size_t length, u32 i) { + assert(i < m->kilo_count); + DEBUG_PRINTF("MQE_TOP + %u @%lld\n", i, loc); + if (mmbit_set(active, m->kilo_count, i)) { + DEBUG_PRINTF("kilo is already alive and kicking\n"); + return; + } + + const struct mpv_kilopuff *kp = (const struct mpv_kilopuff *)(m + 1); + + assert(!kp[i].auto_restart); /* handle later/never */ + + /* we need to ensure that the counters are upto date */ + normalize_counters(dstate, m); + + /* reset counter */ + *get_counter_for_kilo(dstate, &kp[i]) = 0; + + if ((size_t)loc == length) { + /* end of buffer, just make sure it is active */ + dstate->active[i].limit = loc; + dstate->active[i].curr = get_init_puff(m, &kp[i]); + return; + } + + /* find the limit */ + u64a lim = limitByReach(&kp[i], buf + loc, length - loc) + loc; + + /* no need to worry about dead_point triggers here as kilopuff must first + * update chain (to fire a report) before it goes dead. 
*/ + + if (lim == (u64a)loc) { + DEBUG_PRINTF("dead on arrival\n"); + mmbit_unset(active, m->kilo_count, i); + return; + } + dstate->active[i].limit = lim; + + /* setup puffette, find next trigger */ + dstate->active[i].curr = get_init_puff(m, &kp[i]); + if (dstate->active[i].curr[1].report != INVALID_REPORT) { + u32 next_trigger = dstate->active[i].curr[1].repeats - 1ULL + loc; + lim = MIN(lim, next_trigger); + } + + assert(dstate->active[i].curr[0].repeats == 1 + || dstate->active[i].curr[0].report == INVALID_REPORT); + if (dstate->active[i].curr[0].repeats == 1) { + DEBUG_PRINTF("yippee\n"); + mmbit_set(reporters, m->kilo_count, i); + } + + assert(lim > (u64a)loc); + + /* add to pq */ + if (lim != length) { + struct mpv_pq_item temp = { + .trigger_loc = lim, + .kilo = i + }; + + DEBUG_PRINTF("push for %u at %llu\n", i, lim); + pq_insert(pq, dstate->pq_size, temp); + ++dstate->pq_size; + } +} + +static really_inline +void killKilo(const struct mpv *m, u8 *active, u8 *reporters, + struct mpv_decomp_state *dstate, struct mpv_pq_item *pq, u32 i) { + DEBUG_PRINTF("squashing kilo %u (progress %llu, limit %llu)\n", + i, pq_top(pq)->trigger_loc, dstate->active[i].limit); + mmbit_unset(active, m->kilo_count, i); + mmbit_unset(reporters, m->kilo_count, i); + + pq_pop(pq, dstate->pq_size); + dstate->pq_size--; +} + +static really_inline +void updateKiloChains(const struct mpv *m, u8 *reporters, + struct mpv_decomp_state *dstate, struct mpv_pq_item *pq, + u64a curr_loc, size_t buf_length, u32 i) { + const struct mpv_kilopuff *kp = (const void *)(m + 1); + u64a counter = get_counter_value_for_kilo(dstate, &kp[i]); + + DEBUG_PRINTF("updating active puff for kilo %u\n", i); + dstate->active[i].curr = update_curr_puff(m, reporters, counter, + dstate->active[i].curr, i); + + u64a next_trigger = dstate->active[i].limit; + + if (dstate->active[i].curr[1].report != INVALID_REPORT) { + u64a next_rep_trigger = dstate->active[i].curr[1].repeats - 1 - counter + + curr_loc; + + next_trigger = MIN(next_trigger, next_rep_trigger); + } else if (kp[i].dead_point != MPV_DEAD_VALUE) { + u64a dp_trigger = kp[i].dead_point - counter + curr_loc; + DEBUG_PRINTF("dead point trigger %llu\n", dp_trigger); + if (dp_trigger < dstate->active[i].limit) { + dstate->active[i].limit = dp_trigger; + next_trigger = dp_trigger; + } + } + + DEBUG_PRINTF("next trigger location is %llu\n", next_trigger); + + if (next_trigger < buf_length) { + assert(dstate->pq_size <= m->kilo_count); + assert(next_trigger > pq_top(pq)->trigger_loc); + struct mpv_pq_item temp = { + .trigger_loc = next_trigger, + .kilo = i + }; + + DEBUG_PRINTF("(replace) push for %u at %llu\n", i, next_trigger); + pq_replace_top(pq, dstate->pq_size, temp); + } else { + pq_pop(pq, dstate->pq_size); + dstate->pq_size--; + DEBUG_PRINTF("PQ_POP\n"); + } + DEBUG_PRINTF("pq size now %u next top %llu\n", dstate->pq_size, + pq_top(pq)->trigger_loc); +} + +static really_inline +u8 do_single_shufti(const m128 l, const m128 h, u8 c) { + const u8 *lo = (const u8 *)&l; + const u8 *hi = (const u8 *)&h; + return lo[c & 0xf] & hi[c >> 4]; +} + +static really_inline +size_t find_last_bad(const struct mpv_kilopuff *kp, const u8 *buf, + size_t length, size_t curr, u32 min_rep) { + assert(kp->type != MPV_DOT); + + DEBUG_PRINTF("repeats = %u\n", min_rep); + /* TODO: this should be replace by some sort of simd stuff */ + + if (kp->type == MPV_VERM) { + if (min_rep < MIN_SKIP_REPEAT) { + return find_nverm_run(kp->u.verm.c, 0, min_rep, buf, buf + curr, + buf + length) - buf - 1; + } + + 
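+        /* Skip-ahead strategy (the numbers below are an illustration only):
+         * rather than rescanning byte by byte, probe min_rep bytes ahead of
+         * the current terminating byte and walk backwards for the last byte
+         * that also terminates the run.  If one is found at curr + k with
+         * k < min_rep, a restart just after curr can collect at most k - 1
+         * repeats before dying again, so curr can jump straight to curr + k
+         * and the probe is repeated from there.  E.g. with min_rep = 40 and
+         * terminating bytes at curr and curr + 25, only 24 repeats fit in
+         * between, so the restart point moves on to curr + 25. */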
verm_restart:; + assert(buf[curr] == kp->u.verm.c); + size_t test = curr; + if (curr + min_rep < length) { + test = curr + min_rep; + } else { + test = length - 1; + } + + while (test > curr) { + if (buf[test] == kp->u.verm.c) { + curr = test; + if (curr == length - 1) { + return curr; + } + goto verm_restart; + } + --test; + } + } else if (kp->type == MPV_SHUFTI) { + m128 lo = kp->u.shuf.mask_lo; + m128 hi = kp->u.shuf.mask_hi; + shuf_restart: + assert(do_single_shufti(lo, hi, buf[curr])); + size_t test = curr; + if (curr + min_rep < length) { + test = curr + min_rep; + } else { + test = length - 1; + } + + while (test > curr) { + if (do_single_shufti(lo, hi, buf[test])) { + DEBUG_PRINTF("updating curr from %zu to %zu\n", curr, test); + curr = test; + if (curr == length - 1) { + return curr; + } + goto shuf_restart; + } + --test; + } + } else if (kp->type == MPV_TRUFFLE) { + const m128 mask1 = kp->u.truffle.mask1; + const m128 mask2 = kp->u.truffle.mask2; + truffle_restart:; + size_t test = curr; + if (curr + min_rep < length) { + test = curr + min_rep; + } else { + test = length - 1; + } + + while (test > curr) { + const u8 *rv = truffleExec(mask1, mask2, buf + test, buf + test + 1); + if (rv == buf + test) { + curr = test; + if (curr == length - 1) { + return curr; + } + goto truffle_restart; + } + --test; + } + } else if (kp->type == MPV_NVERM) { + if (min_rep < MIN_SKIP_REPEAT) { + return find_verm_run(kp->u.verm.c, 0, min_rep, buf, buf + curr, + buf + length) - buf - 1; + } + + nverm_restart:; + assert(buf[curr] != kp->u.verm.c); + size_t test = curr; + if (curr + min_rep < length) { + test = curr + min_rep; + } else { + test = length - 1; + } + + while (test > curr) { + if (buf[test] != kp->u.verm.c) { + curr = test; + if (curr == length - 1) { + return curr; + } + goto nverm_restart; + } + --test; + } + } else { + assert(0); + } + + return curr; +} + +static really_inline +void restartKilo(const struct mpv *m, UNUSED u8 *active, u8 *reporters, + struct mpv_decomp_state *dstate, struct mpv_pq_item *pq, + const u8 *buf, u64a prev_limit, size_t buf_length, u32 i) { + const struct mpv_kilopuff *kp = (const void *)(m + 1); + assert(kp[i].auto_restart); + assert(mmbit_isset(active, m->kilo_count, i)); + + DEBUG_PRINTF("we got to %llu,%llu\n", prev_limit, dstate->active[i].limit); + assert(prev_limit == dstate->active[i].limit); + + DEBUG_PRINTF("resetting counter\n"); + + /* we need to ensure that the counters are upto date */ + normalize_counters(dstate, m); + + /* current byte is dead, will wrap to 0 after processing this byte */ + assert(MPV_DEAD_VALUE + 1 == 0); + *get_counter_for_kilo(dstate, &kp[i]) = MPV_DEAD_VALUE; + + DEBUG_PRINTF("resetting puffettes\n"); + dstate->active[i].curr = get_init_puff(m, &kp[i]); + + assert(dstate->active[i].curr[0].report == INVALID_REPORT); + /* TODO: handle restart .{1,}s */ + + mmbit_unset(reporters, m->kilo_count, i); + + if (prev_limit != buf_length - 1) { + size_t last_bad = find_last_bad(&kp[i], buf, buf_length, prev_limit, + dstate->active[i].curr[1].repeats); + assert(last_bad >= prev_limit && last_bad < buf_length); + if (last_bad != prev_limit) { + /* there is no point in getting restarted at this location */ + dstate->active[i].limit = last_bad; + assert(dstate->pq_size <= m->kilo_count); + struct mpv_pq_item temp = { + .trigger_loc = last_bad, + .kilo = i + }; + + pq_replace_top(pq, dstate->pq_size, temp); + return; + } + } + + /* TODO: skipping would really come in handy about now */ + u64a lim; + if (buf_length > prev_limit + 1) { + 
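+        /* the byte at prev_limit is the one that fell outside this puff's
+         * reach (it is what ended the previous run), so the search for the
+         * next limit resumes one byte after it */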
lim = limitByReach(&kp[i], buf + prev_limit + 1, + buf_length - (prev_limit + 1)) + + prev_limit + 1; + } else { + assert(buf_length == prev_limit + 1); + lim = buf_length; + } + DEBUG_PRINTF("next limit is %llu\n", lim); + + assert(lim > prev_limit); + + dstate->active[i].limit = lim; + + if (dstate->active[i].curr[1].report != INVALID_REPORT) { + u32 next_trigger = dstate->active[i].curr[1].repeats + prev_limit; + lim = MIN(lim, next_trigger); + } + + DEBUG_PRINTF("next trigger for kilo at %llu\n", lim); + + if (lim < buf_length) { + assert(dstate->pq_size <= m->kilo_count); + assert(lim >= prev_limit); + struct mpv_pq_item temp = { + .trigger_loc = lim, + .kilo = i + }; + + pq_replace_top(pq, dstate->pq_size, temp); + } else { + pq_pop(pq, dstate->pq_size); + dstate->pq_size--; + } +} + +static really_inline +void handle_events(const struct mpv *m, u8 *active, u8 *reporters, + struct mpv_decomp_state *dstate, struct mpv_pq_item *pq, + u64a loc, const u8 *buf, size_t buf_length) { + const struct mpv_kilopuff *kp = (const void *)(m + 1); + + while (dstate->pq_size && pq_top(pq)->trigger_loc <= loc) { + assert(pq_top(pq)->trigger_loc == loc); + + u32 kilo = pq_top(pq)->kilo; + + DEBUG_PRINTF("pop for kilo %u at %llu\n", kilo, + pq_top(pq)->trigger_loc); + + if (dstate->active[kilo].limit <= loc) { + if (!kp[kilo].auto_restart) { + killKilo(m, active, reporters, dstate, pq, kilo); + } else { + restartKilo(m, active, reporters, dstate, pq, buf, loc, + buf_length, kilo); + } + } else { + updateKiloChains(m, reporters, dstate, pq, loc, buf_length, kilo); + } + } +} + +static really_inline +u64a find_next_limit(const struct mpv *m, u8 *active, u8 *reporters, + struct mpv_decomp_state *dstate, struct mpv_pq_item *pq, + const u8 *buf, u64a prev_limit, u64a ep, + size_t buf_length) { + u64a limit = ep; + + DEBUG_PRINTF("length %llu (prev %llu), pq %u\n", limit, prev_limit, + dstate->pq_size); + + handle_events(m, active, reporters, dstate, pq, prev_limit, buf, + buf_length); + + if (dstate->pq_size) { + limit = MIN(pq_top(pq)->trigger_loc, limit); + assert(limit > prev_limit); + } + + DEBUG_PRINTF("limit now %llu\n", limit); + return limit; +} + +static really_inline +char mpvExec(const struct mpv *m, u8 *active, u8 *reporters, + struct mpv_decomp_state *dstate, struct mpv_pq_item *pq, + const u8 *buf, s64a start, size_t length, size_t buf_length, + u64a offsetAdj, NfaCallback cb, void *ctxt) { + DEBUG_PRINTF("running mpv (s %lliu, l %zu, o %llu)\n", + *get_counter_n(dstate, m, 0) + dstate->counter_adj, length, + offsetAdj); + + u64a progress = start; /* progress is relative to buffer offsets */ + + while (progress < length) { + DEBUG_PRINTF("progress %llu\n", progress); + + /* find next limit and update chains */ + u64a limit = find_next_limit(m, active, reporters, dstate, pq, buf, + progress, length, buf_length); + assert(limit != progress); + u64a incr = limit - progress; + DEBUG_PRINTF("incr = %llu\n", incr); + + /* report matches upto next limit */ + char rv = processReportsForRange(m, reporters, dstate, + offsetAdj + progress, limit - progress, + cb, ctxt); + + if (rv != MO_CONTINUE_MATCHING) { + DEBUG_PRINTF("mpvExec done %llu/%zu\n", progress, length); + return rv; + } + + dstate->counter_adj += incr; + progress = limit; + } + + assert(progress == length); + + DEBUG_PRINTF("mpvExec done\n"); + return MO_CONTINUE_MATCHING; +} + +static really_inline +void mpvLoadState(struct mpv_decomp_state *out, const struct NFA *n, + const char *state) { + assert(16 >= sizeof(struct mpv_decomp_kilo)); + 
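+    /* Expand the compressed stream state into scratch: each counter is held
+     * in counter_info[i].counter_size bytes in the stream and is widened to
+     * a full u64a here; limits, current puffettes and the priority queue are
+     * not stored in stream state at all and are rebuilt lazily on the next
+     * _Q call (see out->filled = 0 below). */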
assert(sizeof(*out) <= n->scratchStateSize); + assert(ISALIGNED(out)); + + const struct mpv *m = getImplNfa(n); + const struct mpv_counter_info *counter_info = get_counter_info(m); + u64a *counters = get_counter_n(out, m, 0); + const char *comp_counter = state; + for (u32 i = 0; i < m->counter_count; i++) { + u32 counter_size = counter_info[i].counter_size; + counters[i] = partial_load_u64a(comp_counter, counter_size); + DEBUG_PRINTF("loaded %llu counter %u\n", counters[i], i); + comp_counter += counter_size; + } + + out->filled = 0; /* _Q_i will fill limits, curr puffetes, and populate pq + * on first call */ + out->counter_adj = 0; + out->pq_size = 0; + + u8 *reporters = (u8 *)out + m->reporter_offset; + + mmbit_clear(reporters, m->kilo_count); +} + +static really_inline +void mpvStoreState(const struct NFA *n, char *state, + const struct mpv_decomp_state *in) { + assert(ISALIGNED(in)); + const struct mpv *m = getImplNfa(n); + const struct mpv_counter_info *counter_info = get_counter_info(m); + + const u64a *counters = (const u64a *)((const char *)in + + get_counter_info(m)[0].counter_offset); + u64a adj = in->counter_adj; + char *comp_counter = state; + for (u32 i = 0; i < m->counter_count; i++) { + /* clamp counter to allow storage in smaller ints */ + u64a curr_counter = MIN(counters[i] + adj, counter_info[i].max_counter); + + u32 counter_size = counter_info[i].counter_size; + partial_store_u64a(comp_counter, curr_counter, counter_size); + DEBUG_PRINTF("stored %llu counter %u (orig %llu)\n", curr_counter, i, + counters[i]); + /* assert(counters[i] != MPV_DEAD_VALUE); /\* should have process 1 byte */ + /* * since a clear *\/ */ + comp_counter += counter_size; + } +} + +char nfaExecMpv_queueCompressState(const struct NFA *nfa, const struct mq *q, + UNUSED s64a loc) { + void *dest = q->streamState; + const void *src = q->state; + mpvStoreState(nfa, dest, src); + return 0; +} + +char nfaExecMpv_expandState(const struct NFA *nfa, void *dest, const void *src, + UNUSED u64a offset, UNUSED u8 key) { + mpvLoadState(dest, nfa, src); + return 0; +} + +char nfaExecMpv_reportCurrent(const struct NFA *n, struct mq *q) { + const struct mpv *m = getImplNfa(n); + u64a offset = q_cur_offset(q); + struct mpv_decomp_state *s = (struct mpv_decomp_state *)q->state; + + DEBUG_PRINTF("report current: offset %llu\n", offset); + + u8 *active = (u8 *)q->streamState + m->active_offset; + u32 rl_count = 0; + ReportID *rl = get_report_list(m, s); + + processReports(m, active, s, s->counter_adj, offset, q->cb, q->context, rl, + &rl_count); + return 0; +} + +char nfaExecMpv_queueInitState(const struct NFA *n, struct mq *q) { + struct mpv_decomp_state *out = (void *)q->state; + const struct mpv *m = getImplNfa(n); + assert(sizeof(*out) <= n->scratchStateSize); + + DEBUG_PRINTF("queue init state\n"); + + u64a *counters = get_counter_n(out, m, 0); + for (u32 i = 0; i < m->counter_count; i++) { + counters[i] = MPV_DEAD_VALUE; + } + + out->filled = 0; + out->counter_adj = 0; + out->pq_size = 0; + out->active[0].curr = NULL; + + assert(q->streamState); + u8 *active_kpuff = (u8 *)q->streamState + m->active_offset; + u8 *reporters = (u8 *)q->state + m->reporter_offset; + mmbit_clear(active_kpuff, m->kilo_count); + mmbit_clear(reporters, m->kilo_count); + return 0; +} + +char nfaExecMpv_initCompressedState(const struct NFA *n, u64a offset, + void *state, UNUSED u8 key) { + const struct mpv *m = getImplNfa(n); + memset(state, 0, m->active_offset); /* active_offset marks end of comp + * counters */ + u8 *active_kpuff = (u8 
*)state + m->active_offset; + if (!offset) { + mmbit_init_range(active_kpuff, m->kilo_count, m->top_kilo_begin, + m->top_kilo_end); + return 1; + } else { + return 0; + } +} + +static really_inline +char nfaExecMpv_Q_i(const struct NFA *n, struct mq *q, s64a end) { + u64a offset = q->offset; + const u8 *buffer = q->buffer; + size_t length = q->length; + NfaCallback cb = q->cb; + void *context = q->context; + s64a sp; + const struct mpv *m = getImplNfa(n); + struct mpv_decomp_state *s = (struct mpv_decomp_state *)q->state; + u8 *active = (u8 *)q->streamState + m->active_offset; + u8 *reporters = (u8 *)q->state + m->reporter_offset; + struct mpv_pq_item *pq = (struct mpv_pq_item *)(q->state + m->pq_offset); + + if (!s->filled) { + fillLimits(m, active, reporters, s, pq, q->buffer, q->length); + } + + assert(!q->report_current); + + if (q->cur == q->end) { + return 1; + } + + assert(q->cur + 1 < q->end); /* require at least two items */ + + assert(q_cur_type(q) == MQE_START); + assert(q_cur_loc(q) >= 0); + sp = q->items[q->cur].location; + q->cur++; + + if (q->items[q->cur - 1].location > end) { + /* this is as far as we go */ + q->cur--; + q->items[q->cur].type = MQE_START; + q->items[q->cur].location = end; + return MO_ALIVE; + } + + while (q->cur < q->end) { + s64a ep = q->items[q->cur].location; + + ep = MIN(ep, end); + + assert(ep >= sp); + + assert(sp >= 0); /* mpv should be an outfix; outfixes are not lazy */ + + if (sp >= ep) { + goto scan_done; + } + + /* do main buffer region */ + assert((u64a)ep <= length); + char rv = mpvExec(m, active, reporters, s, pq, buffer, sp, ep, length, + offset, cb, context); + if (rv == MO_HALT_MATCHING) { + q->cur = q->end; + return 0; + } + + scan_done: + if (q->items[q->cur].location > end) { + /* this is as far as we go */ + q->cur--; + q->items[q->cur].type = MQE_START; + q->items[q->cur].location = end; + return MO_ALIVE; + } + + sp = ep; + + switch (q->items[q->cur].type) { + case MQE_TOP: + DEBUG_PRINTF("top %u %u\n", m->top_kilo_begin, m->top_kilo_end); + /* MQE_TOP initialise all counters to 0; activates all kilos */ + { + u64a *counters = get_counter_n(s, m, 0); + assert(counters[0] == MPV_DEAD_VALUE); + assert(!s->counter_adj); + for (u32 i = 0; i < m->counter_count; i++) { + counters[i] = 0; + } + mmbit_init_range(active, m->kilo_count, m->top_kilo_begin, + m->top_kilo_end); + fillLimits(m, active, reporters, s, pq, buffer, length); + } + break; + case MQE_START: + case MQE_END: + break; + default: + /* MQE_TOP_N --> switch on kilo puff N */ + assert(q->items[q->cur].type >= MQE_TOP_FIRST); + assert(q->items[q->cur].type < MQE_INVALID); + u32 i = q->items[q->cur].type - MQE_TOP_FIRST; + handleTopN(m, sp, active, reporters, s, pq, buffer, length, i); + break; + } + + q->cur++; + } + + char alive = 0; + assert(q->items[q->cur - 1].type == MQE_END); + if (q->items[q->cur - 1].location == (s64a)q->length) { + normalize_counters(s, m); + + const struct mpv_kilopuff *kp = (const struct mpv_kilopuff *)(m + 1); + for (u32 i = mmbit_iterate(active, m->kilo_count, MMB_INVALID); + i != MMB_INVALID; i = mmbit_iterate(active, m->kilo_count, i)) { + if (*get_counter_for_kilo(s, &kp[i]) >= kp[i].dead_point) { + mmbit_unset(active, m->kilo_count, i); + } else { + alive = 1; + } + } + } else { + alive + = mmbit_iterate(active, m->kilo_count, MMB_INVALID) != MMB_INVALID; + } + + DEBUG_PRINTF("finished %d\n", (int)alive); + return alive; +} + +char nfaExecMpv_Q(const struct NFA *n, struct mq *q, s64a end) { + DEBUG_PRINTF("_Q %lld\n", end); + return 
nfaExecMpv_Q_i(n, q, end); +} + +s64a nfaExecMpv_QueueExecRaw(const struct NFA *nfa, struct mq *q, s64a end) { + DEBUG_PRINTF("nfa=%p end=%lld\n", nfa, end); +#ifdef DEBUG + debugQueue(q); +#endif + + assert(nfa->type == MPV_NFA); + assert(q && q->context && q->state); + assert(end >= 0); + assert(q->cur < q->end); + assert(q->end <= MAX_MQE_LEN); + assert(ISALIGNED_16(nfa) && ISALIGNED_16(getImplNfa(nfa))); + assert(end < q->items[q->end - 1].location + || q->items[q->end - 1].type == MQE_END); + + if (q->items[q->cur].location > end) { + return 1; + } + + char q_trimmed = 0; + + assert(end <= (s64a)q->length || !q->hlength); + /* due to reverse accel in block mode some queues may work on a truncated + * buffer */ + if (end > (s64a)q->length) { + end = q->length; + q_trimmed = 1; + } + + /* TODO: restore max offset stuff, if/when _interesting_ max offset stuff + * is filled in */ + + char rv = nfaExecMpv_Q_i(nfa, q, end); + + assert(!q->report_current); + DEBUG_PRINTF("returned rv=%d, q_trimmed=%d\n", rv, q_trimmed); + if (q_trimmed || !rv) { + return 0; + } else { + const struct mpv *m = getImplNfa(nfa); + u8 *reporters = (u8 *)q->state + m->reporter_offset; + + if (mmbit_any_precise(reporters, m->kilo_count)) { + DEBUG_PRINTF("next byte\n"); + return 1; /* need to match at next byte */ + } else { + s64a next_event = q->length; + s64a next_pq = q->length; + + if (q->cur < q->end) { + next_event = q->items[q->cur].location; + } + + struct mpv_decomp_state *s = (struct mpv_decomp_state *)q->state; + struct mpv_pq_item *pq + = (struct mpv_pq_item *)(q->state + m->pq_offset); + if (s->pq_size) { + next_pq = pq_top(pq)->trigger_loc; + } + + assert(next_event); + assert(next_pq); + + DEBUG_PRINTF("next pq %lld event %lld\n", next_pq, next_event); + return MIN(next_pq, next_event); + } + } +} diff --git a/regex/nfa/mpv.h b/regex/nfa/mpv.h new file mode 100644 index 000000000..3780728d7 --- /dev/null +++ b/regex/nfa/mpv.h @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2015-2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef MPV_H +#define MPV_H + +#include "ue2common.h" + +struct mq; +struct NFA; + +char nfaExecMpv_Q(const struct NFA *n, struct mq *q, s64a end); +char nfaExecMpv_reportCurrent(const struct NFA *n, struct mq *q); +char nfaExecMpv_queueInitState(const struct NFA *n, struct mq *q); +char nfaExecMpv_initCompressedState(const struct NFA *n, u64a offset, + void *state, u8 key); +char nfaExecMpv_queueCompressState(const struct NFA *nfa, const struct mq *q, + s64a loc); +char nfaExecMpv_expandState(const struct NFA *nfa, void *dest, const void *src, + u64a offset, u8 key); + +#define nfaExecMpv_testEOD NFA_API_NO_IMPL +#define nfaExecMpv_inAccept NFA_API_NO_IMPL +#define nfaExecMpv_inAnyAccept NFA_API_NO_IMPL +#define nfaExecMpv_QR NFA_API_NO_IMPL +#define nfaExecMpv_Q2 NFA_API_NO_IMPL /* for non-chained suffixes. */ +#define nfaExecMpv_B_Reverse NFA_API_NO_IMPL +#define nfaExecMpv_zombie_status NFA_API_ZOMBIE_NO_IMPL + +/** + * return 0 if the mpv dies, otherwise returns the location of the next possible + * match (given the currently known events). */ +s64a nfaExecMpv_QueueExecRaw(const struct NFA *nfa, struct mq *q, s64a end); + +#endif diff --git a/regex/nfa/mpv_internal.h b/regex/nfa/mpv_internal.h new file mode 100644 index 000000000..a52853dce --- /dev/null +++ b/regex/nfa/mpv_internal.h @@ -0,0 +1,197 @@ +/* + * Copyright (c) 2015-2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef MPV_INTERNAL_H +#define MPV_INTERNAL_H + +#include "ue2common.h" + +#define MPV_DOT 0 +#define MPV_VERM 1 +#define MPV_SHUFTI 2 +#define MPV_TRUFFLE 3 +#define MPV_NVERM 4 + +struct mpv_puffette { + u32 repeats; + char unbounded; + + /** + * \brief Report is simple-exhaustible. + * + * If this is true, we do best-effort suppression of runs of reports, only + * delivering the first one. 
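+     *
+     * Concretely, processReports() does not append such a report to the
+     * per-range report list, so a range of repeat-matching bytes yields a
+     * single callback at the start of the range rather than one callback
+     * per byte.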
+ */ + char simple_exhaust; + + ReportID report; +}; + +struct mpv_kilopuff { + u32 counter_offset; /**< offset (in full stream state) to the counter that + * this kilopuff refers to */ + u32 count; /**< number of real (non sentinel mpv puffettes) */ + u32 puffette_offset; /**< relative to base of mpv, points past the 1st + * sent */ + u64a dead_point; + u8 auto_restart; + u8 type; /* MPV_DOT, MPV_VERM, etc */ + union { + struct { + char c; + } verm; + struct { + m128 mask_lo; + m128 mask_hi; + } shuf; + struct { + m128 mask1; + m128 mask2; + } truffle; + } u; +}; + +struct mpv_counter_info { + u64a max_counter; /**< maximum value this counter needs to track */ + u32 counter_size; /**< number of bytes to represent the counter in stream + * state */ + u32 counter_offset; /**< offset that this counter is stored at in the + * full stream state */ + u32 kilo_begin; /**< first kilo to turn on when the counter is started */ + u32 kilo_end; /**< 1 + last kilo to turn on when the counter is started */ +}; + +struct ALIGN_AVX_DIRECTIVE mpv { + u32 kilo_count; /**< number of kilopuffs following */ + u32 counter_count; /**< number of counters managed by the mpv */ + u32 puffette_count; /**< total number of puffettes under all the kilos */ + u32 pq_offset; /**< offset to the priority queue in the decompressed + * state */ + u32 reporter_offset; /**< offset to the reporter mmbit in the decompressed + * state */ + u32 report_list_offset; /**< offset to the report list scratch space in the + * decompressed state */ + u32 active_offset; /**< offset to the active kp mmbit in the compressed + * state */ + u32 top_kilo_begin; /**< first kilo to switch on when top arrives */ + u32 top_kilo_end; /**< one past the last kilo to switch on when top + * arrives */ +}; + +struct mpv_decomp_kilo { + u64a limit; + const struct mpv_puffette *curr; +}; + +/* note: size varies on different platforms */ +struct mpv_decomp_state { + u32 pq_size; + char filled; + u64a counter_adj; /**< progress not yet written to the real counters */ + struct mpv_decomp_kilo active[]; +}; + +/* --- + * | | mpv + * --- + * | | + * | | kilo_count * mpv_kilopuffs + * | | + * ... + * | | + * --- + * | | + * | | counter_count * mpv_counter_infos + * | | + * ... + * | | + * --- + * | | sentinel mpv_puffette + * --- + * | | mpv_puffettes for 1st kilopuff + * | | (mpv_puffettes are ordered by minimum number of repeats) + * | | + * --- + * | | sentinel mpv_puffette + * --- + * | | mpv_puffettes for 2nd kilopuff + * ... + * | | + * --- + * | | sentinel mpv_puffette + * --- + */ + +/* + * Stream State + * [Compressed Counter 0] + * [Compressed Counter 1] + * ... + * [Compressed Counter N] + * [mmbit of active kilopuffs] + * + * Decompressed State + * [header (limit pq_size)] + * [ + * [kilo 1 current reports] + * ... + * [kilo N current reports] + * ] + * [ + * [Full Counter 0] + * [Full Counter 1] + * ... 
+ * [Full Counter N] + * ] + * [pq of kilo changes] + * [scratch space for current report lists (total number of puffettes)] + * [mmbit of kilopuffs with active reports] + */ + +struct mpv_pq_item { + u64a trigger_loc; + u32 kilo; +}; + +/* returns pointer to first non sentinel mpv_puff */ +static really_inline +const struct mpv_puffette *get_puff_array(const struct mpv *m, + const struct mpv_kilopuff *kp) { + return (const struct mpv_puffette *)((const char *)m + kp->puffette_offset); +} + +static really_inline +const struct mpv_counter_info *get_counter_info(const struct mpv *m) { + return (const struct mpv_counter_info *)((const char *)(m + 1) + + m->kilo_count * sizeof(struct mpv_kilopuff)); +} + +#define MPV_DEAD_VALUE (~0ULL) +#define INVALID_REPORT (~0U) + +#endif diff --git a/regex/nfa/nfa_api.h b/regex/nfa/nfa_api.h new file mode 100644 index 000000000..e3f7f7431 --- /dev/null +++ b/regex/nfa/nfa_api.h @@ -0,0 +1,280 @@ +/* + * Copyright (c) 2015-2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Declarations for the main NFA Engine API. + * + * This file provides the internal API for all runtime engines ("NFAs", even if + * they're not strictly NFA implementations). + */ + +#ifndef NFA_API_H +#define NFA_API_H + +#ifdef __cplusplus +extern "C" +{ +#endif + +#include "callback.h" +#include "ue2common.h" + +struct mq; +struct NFA; + +/** + * Indicates if an nfa is a zombie. Note: that there were plans for a more + * nuanced view of zombiehood but this never eventuated. + */ +enum nfa_zombie_status { + NFA_ZOMBIE_NO, /**< nfa is not a zombie and will respond to top events */ + NFA_ZOMBIE_ALWAYS_YES /**< nfa is a zombie and will always be a zombie */ +}; + +/** + * Compresses an engine's state. + * The expanded state (@ref mq::state, @ref mq::streamState) is reduced purely + * to a corresponding compressed stream state (@ref mq::streamState). + * + * @param nfa engine the state belongs to + * @param q queue for the engine. 
The final compressed stream stream is placed + * in the location indicated by @ref mq::streamState + * @param loc the location corresponding to the engine's current state + */ +char nfaQueueCompressState(const struct NFA *nfa, const struct mq *q, s64a loc); + +/** + * Expands an engine's compressed stream state, into its scratch space + * representation. This is required before an engine starts operating over its + * queue. + * + * @param nfa engine the state belongs to + * @param dest location in scratch for decompressed state + * @param src compressed stream state + * @param offset the current stream offset. + * @param key byte corresponding to the location where the compressed state was + * created. + */ +char nfaExpandState(const struct NFA *nfa, void *dest, const void *src, + u64a offset, u8 key); + +/** + * Gives us a properly initialised dead state suitable for later @ref + * nfaQueueExec calls. + */ +char nfaQueueInitState(const struct NFA *nfa, struct mq *q); + +/** + * Initialise the state, applying a TOP appropriate for the offset. If the + * NFA becomes inactive, return zero. Otherwise, write out its compressed + * representation to `state' and return non-zero. + * + * @param nfa engine the state belongs to + * @param offset offset in the stream (relative to start of stream) + * @param state pointer indicating where the state is to be written + * @param key byte corresponding to the location where the compressed state is + * to be created. + */ +char nfaInitCompressedState(const struct NFA *nfa, u64a offset, void *state, + u8 key); + +/** + * Process the queued commands on the given NFA. + * + * @param nfa the NFA to execute + * @param q the queued commands. It must start with some variant of start and + * end with some variant of end. The location field of the events must + * be monotonically increasing. + * @param end stop processing command queue when we reach this point + * + * @return non-zero if the nfa is still active, if the nfa is not active the + * state data is undefined + * + * Note: this function can not process events from the past: the location field + * of each event must be >= current offset. + */ +char nfaQueueExec(const struct NFA *nfa, struct mq *q, s64a end); + +/** + * Main execution function that doesn't perform the checks and optimisations of + * nfaQueueExec() and just dispatches directly to the nfa implementations. It is + * intended to be used by the Tamarama engine. + */ +char nfaQueueExec_raw(const struct NFA *nfa, struct mq *q, s64a end); + +/** Return value indicating that the engine is dead. */ +#define MO_DEAD 0 + +/** Return value indicating that the engine is alive. */ +#define MO_ALIVE 1 + +/** Return value from @ref nfaQueueExecToMatch indicating that engine progress + * stopped as a match state was reached. */ +#define MO_MATCHES_PENDING 2 + +/** + * Process the queued commands on the given nfa up to end or the first match. + * This function will only fire the callback in response to an report_current + * being set and accepts at the starting offset, in all other situations accepts + * will result in the queue pausing with a return value of + * @ref MO_MATCHES_PENDING. + * + * @param nfa the NFA to execute + * @param q the queued commands. It must start with some variant of start and + * end with some variant of end. The location field of the events must + * be monotonically increasing. If not all the data was processed during + * the call, the queue is updated to reflect the remaining work. 
+ * @param end stop processing command queue when we reach this point + * + * @return @ref MO_ALIVE if the nfa is still active with no matches pending, + * and @ref MO_MATCHES_PENDING if there are matches pending, 0 if not + * alive + * + * Note: if it can be determined that the stream can never match, the nfa + * may be reported as dead even if not all the data was scanned + * + * Note: if the nfa is not alive the state data is undefined + * + * Note: this function can not process events from the past: the location field + * of each event must be >= current offset. + */ +char nfaQueueExecToMatch(const struct NFA *nfa, struct mq *q, s64a end); + +/** + * Main execution function that doesn't perform the checks and optimisations of + * nfaQueueExecToMatch() and just dispatches directly to the nfa + * implementations. It is intended to be used by the Tamarama engine. + */ +char nfaQueueExec2_raw(const struct NFA *nfa, struct mq *q, s64a end); + +/** + * Report matches at the current queue location. + * + * @param nfa the NFA to execute + * @param q the queued commands. It must start with some variant of start and + * end with some variant of end. The location field of the events must + * be monotonically increasing. + * + * Note: the queue MUST be located at position where @ref nfaQueueExecToMatch + * returned @ref MO_MATCHES_PENDING. + * + * Note: the return value of this call is undefined, and should be ignored. + */ +char nfaReportCurrentMatches(const struct NFA *nfa, struct mq *q); + +/** + * Returns non-zero if the NFA is in an accept state with the given report ID. + */ +char nfaInAcceptState(const struct NFA *nfa, ReportID report, struct mq *q); + +/** + * Returns non-zero if the NFA is in any accept state regardless of report + * ID. + */ +char nfaInAnyAcceptState(const struct NFA *nfa, struct mq *q); + +/** + * Process the queued commands on the given NFA up to end or the first match. + * + * Note: This version is meant for rose prefix/infix NFAs: + * - never uses a callback + * - loading of state at a point in history is not special cased + * + * @param nfa the NFA to execute + * @param q the queued commands. It must start with some variant of start and + * end with some variant of end. The location field of the events must + * be monotonically increasing. If not all the data was processed during + * the call, the queue is updated to reflect the remaining work. + * @param report we are interested in. If the given report will be raised at + * the end location, the function returns @ref MO_MATCHES_PENDING. If no + * match information is desired, MO_INVALID_IDX should be passed in. + * @return @ref MO_ALIVE if the nfa is still active with no matches pending, + * and @ref MO_MATCHES_PENDING if there are matches pending, 0 if not + * alive + * + * Note: if it can be determined that the stream can never match, the nfa + * may be reported as dead even if not all the data was scanned + * + * Note: if the NFA is not active the state data is undefined. + */ +char nfaQueueExecRose(const struct NFA *nfa, struct mq *q, ReportID report); + +/** + * Runs an NFA in reverse from (buf + buflen) to buf and then from (hbuf + hlen) + * to hbuf (main buffer and history buffer). + * + * Note: provides the match location as the "end" offset when the callback is + * called. 
+ * + * @param nfa engine to run + * @param offset base offset of buf + * @param buf main buffer + * @param buflen length of buf + * @param hbuf history buf + * @param hlen length of hbuf + * @param callback the callback to call for each match raised + * @param context context pointer passed to each callback + */ +char nfaBlockExecReverse(const struct NFA *nfa, u64a offset, const u8 *buf, + size_t buflen, const u8 *hbuf, size_t hlen, + NfaCallback callback, void *context); + +/** + * Check whether the given NFA's state indicates that it is in one or more + * final (accept at end of data) state. If so, call the callback for each + * match. + * + * @param nfa the NFA to execute + * @param state current state associated with this NFA + * @param streamState stream version of the state associated with this NFA + * (including br region) + * @param offset the offset to return (via the callback) with each match + * @param callback the callback to call for each match raised + * @param context context pointer passed to each callback + * + * @return @ref MO_HALT_MATCHING if the user instructed us to halt, otherwise + * @ref MO_CONTINUE_MATCHING. + */ +char nfaCheckFinalState(const struct NFA *nfa, const char *state, + const char *streamState, u64a offset, + NfaCallback callback, void *context); + +/** + * Indicates if an engine is a zombie. + * + * @param nfa engine to consider + * @param q queue corresponding to the engine + * @param loc current location in the buffer for an engine + */ +enum nfa_zombie_status nfaGetZombieStatus(const struct NFA *nfa, struct mq *q, + s64a loc); +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif diff --git a/regex/nfa/nfa_api_dispatch.c b/regex/nfa/nfa_api_dispatch.c new file mode 100644 index 000000000..75cac4b48 --- /dev/null +++ b/regex/nfa/nfa_api_dispatch.c @@ -0,0 +1,368 @@ +/* + * Copyright (c) 2015-2020, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +/** \file + \brief Dispatches NFA engine API calls to the appropriate engines +*/ +#include "nfa_api.h" + +#include "nfa_api_queue.h" +#include "nfa_internal.h" +#include "ue2common.h" + +// Engine implementations. +#include "castle.h" +#include "gough.h" +#include "lbr.h" +#include "limex.h" +#include "mcclellan.h" +#include "mcsheng.h" +#include "mpv.h" +#include "sheng.h" +#include "tamarama.h" + +#define DISPATCH_CASE(dc_ltype, dc_ftype, dc_func_call) \ + case dc_ltype: \ + return nfaExec##dc_ftype##dc_func_call; \ + break + +// general framework calls + +#define DISPATCH_BY_NFA_TYPE(dbnt_func) \ + switch (nfa->type) { \ + DISPATCH_CASE(LIMEX_NFA_32, LimEx32, dbnt_func); \ + DISPATCH_CASE(LIMEX_NFA_64, LimEx64, dbnt_func); \ + DISPATCH_CASE(LIMEX_NFA_128, LimEx128, dbnt_func); \ + DISPATCH_CASE(LIMEX_NFA_256, LimEx256, dbnt_func); \ + DISPATCH_CASE(LIMEX_NFA_384, LimEx384, dbnt_func); \ + DISPATCH_CASE(LIMEX_NFA_512, LimEx512, dbnt_func); \ + DISPATCH_CASE(MCCLELLAN_NFA_8, McClellan8, dbnt_func); \ + DISPATCH_CASE(MCCLELLAN_NFA_16, McClellan16, dbnt_func); \ + DISPATCH_CASE(GOUGH_NFA_8, Gough8, dbnt_func); \ + DISPATCH_CASE(GOUGH_NFA_16, Gough16, dbnt_func); \ + DISPATCH_CASE(MPV_NFA, Mpv, dbnt_func); \ + DISPATCH_CASE(LBR_NFA_DOT, LbrDot, dbnt_func); \ + DISPATCH_CASE(LBR_NFA_VERM, LbrVerm, dbnt_func); \ + DISPATCH_CASE(LBR_NFA_NVERM, LbrNVerm, dbnt_func); \ + DISPATCH_CASE(LBR_NFA_SHUF, LbrShuf, dbnt_func); \ + DISPATCH_CASE(LBR_NFA_TRUF, LbrTruf, dbnt_func); \ + DISPATCH_CASE(CASTLE_NFA, Castle, dbnt_func); \ + DISPATCH_CASE(SHENG_NFA, Sheng, dbnt_func); \ + DISPATCH_CASE(TAMARAMA_NFA, Tamarama, dbnt_func); \ + DISPATCH_CASE(MCSHENG_NFA_8, McSheng8, dbnt_func); \ + DISPATCH_CASE(MCSHENG_NFA_16, McSheng16, dbnt_func); \ + DISPATCH_CASE(SHENG_NFA_32, Sheng32, dbnt_func); \ + DISPATCH_CASE(SHENG_NFA_64, Sheng64, dbnt_func); \ + DISPATCH_CASE(MCSHENG_64_NFA_8, McSheng64_8, dbnt_func); \ + DISPATCH_CASE(MCSHENG_64_NFA_16, McSheng64_16, dbnt_func); \ + default: \ + assert(0); \ + } + +char nfaCheckFinalState(const struct NFA *nfa, const char *state, + const char *streamState, u64a offset, + NfaCallback callback, void *context) { + assert(ISALIGNED_CL(nfa) && ISALIGNED_CL(getImplNfa(nfa))); + + // Caller should avoid calling us if we can never produce matches. + assert(nfaAcceptsEod(nfa)); + + DISPATCH_BY_NFA_TYPE(_testEOD(nfa, state, streamState, offset, callback, + context)); + return 0; +} + +char nfaQueueInitState(const struct NFA *nfa, struct mq *q) { + assert(ISALIGNED_CL(nfa) && ISALIGNED_CL(getImplNfa(nfa))); + + DISPATCH_BY_NFA_TYPE(_queueInitState(nfa, q)); + return 0; +} + +static really_inline +char nfaQueueExec_i(const struct NFA *nfa, struct mq *q, s64a end) { + DISPATCH_BY_NFA_TYPE(_Q(nfa, q, end)); + return 0; +} + +static really_inline +char nfaQueueExec2_i(const struct NFA *nfa, struct mq *q, s64a end) { + DISPATCH_BY_NFA_TYPE(_Q2(nfa, q, end)); + return 0; +} + +char nfaQueueExec_raw(const struct NFA *nfa, struct mq *q, s64a end) { + return nfaQueueExec_i(nfa, q, end); +} + +char nfaQueueExec2_raw(const struct NFA *nfa, struct mq *q, s64a end) { + return nfaQueueExec2_i(nfa, q, end); +} + +static really_inline +char nfaQueueExecRose_i(const struct NFA *nfa, struct mq *q, ReportID report) { + DISPATCH_BY_NFA_TYPE(_QR(nfa, q, report)); + return 0; +} + +/** Returns 0 if this NFA cannot possibly match (due to width constraints etc) + * and the caller should return 0. May also edit the queue. 
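+ * When queued events lie beyond nfa->maxOffset, the tail of the queue is
+ * trimmed: the last surviving item is rewritten as an MQE_END at the
+ * clamped location and *q_trimmed is set so the caller knows the scan was
+ * cut short.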
*/ +static really_inline +char nfaQueueCanMatch(const struct NFA *nfa, struct mq *q, s64a end, + char *q_trimmed) { + assert(q_trimmed); + assert(q->end - q->cur >= 2); + assert(end >= 0); + + DEBUG_PRINTF("q->offset=%llu, end=%lld\n", q->offset, end); + DEBUG_PRINTF("maxBiAnchoredWidth=%u, maxOffset=%u\n", + nfa->maxBiAnchoredWidth, nfa->maxOffset); + + if (nfa->maxBiAnchoredWidth && + (end + q->offset > nfa->maxBiAnchoredWidth)) { + DEBUG_PRINTF("stream too long: o %llu l %zu max: %hhu\n", q->offset, + q->length, nfa->maxBiAnchoredWidth); + return 0; + } + + if (nfa->maxOffset) { + if (q->offset >= nfa->maxOffset) { + DEBUG_PRINTF("stream is past maxOffset\n"); + return 0; + } + + if (q->offset + end > nfa->maxOffset) { + s64a maxEnd = nfa->maxOffset - q->offset; + DEBUG_PRINTF("me %lld off %llu len = %lld\n", maxEnd, + q->offset, end); + while (q->end > q->cur + && q->items[q->end - 1].location > maxEnd) { + *q_trimmed = 1; + DEBUG_PRINTF("killing item %u %lld %u\n", q->end, + q->items[q->end - 1].location, + q->items[q->end - 1].type); + q->items[q->end - 1].location = maxEnd; + q->items[q->end - 1].type = MQE_END; + if (q->end - q->cur < 2 + ||q->items[q->end - 2].location <= maxEnd) { + break; + } + q->end--; + } + + if (q->end - q->cur < 2) { /* nothing left on q */ + DEBUG_PRINTF("queue empty\n"); + return 0; + } + } + +#ifdef DEBUG + if (*q_trimmed) { + debugQueue(q); + } +#endif + } + + return 1; +} + +char nfaQueueExec(const struct NFA *nfa, struct mq *q, s64a end) { + DEBUG_PRINTF("nfa=%p end=%lld\n", nfa, end); +#ifdef DEBUG + debugQueue(q); +#endif + + assert(q && q->context && q->state); + assert(end >= 0); + assert(q->cur < q->end); + assert(q->end <= MAX_MQE_LEN); + assert(ISALIGNED_CL(nfa) && ISALIGNED_CL(getImplNfa(nfa))); + assert(end < q->items[q->end - 1].location + || q->items[q->end - 1].type == MQE_END); + + if (q->items[q->cur].location > end) { + return 1; + } + + char q_trimmed = 0; + + assert(end <= (s64a)q->length || !q->hlength); + /* due to reverse accel in block mode some queues may work on a truncated + * buffer */ + if (end > (s64a)q->length) { + end = q->length; + q_trimmed = 1; + } + + if (!nfaQueueCanMatch(nfa, q, end, &q_trimmed)) { + if (q->report_current) { + nfaReportCurrentMatches(nfa, q); + q->report_current = 0; + } + + return 0; + } + + char rv = nfaQueueExec_i(nfa, q, end); + +#ifdef DEBUG + debugQueue(q); +#endif + + assert(!q->report_current); + DEBUG_PRINTF("returned rv=%d, q_trimmed=%d\n", rv, q_trimmed); + return rv && !q_trimmed; +} + +char nfaQueueExecToMatch(const struct NFA *nfa, struct mq *q, s64a end) { + DEBUG_PRINTF("nfa=%p end=%lld\n", nfa, end); +#ifdef DEBUG + debugQueue(q); +#endif + + assert(q); + assert(end >= 0); + assert(q->state); + assert(q->cur < q->end); + assert(q->end <= MAX_MQE_LEN); + assert(ISALIGNED_CL(nfa) && ISALIGNED_CL(getImplNfa(nfa))); + assert(end < q->items[q->end - 1].location + || q->items[q->end - 1].type == MQE_END); + + char q_trimmed_ra = 0; + assert(end <= (s64a)q->length || !q->hlength); + /* due to reverse accel in block mode some queues may work on a truncated + * buffer */ + if (q->items[q->cur].location > end) { + return 1; + } + + if (end > (s64a)q->length) { + end = q->length; + q_trimmed_ra = 1; + } + + char q_trimmed = 0; + if (!nfaQueueCanMatch(nfa, q, end, &q_trimmed)) { + if (q->report_current) { + nfaReportCurrentMatches(nfa, q); + q->report_current = 0; + } + + return 0; + } + + char rv = nfaQueueExec2_i(nfa, q, end); + assert(!q->report_current); + DEBUG_PRINTF("returned rv=%d, 
q_trimmed=%d\n", rv, q_trimmed); + if (rv == MO_MATCHES_PENDING) { + if (q_trimmed) { + // We need to "fix" the queue so that subsequent operations must + // trim it as well. + assert(q->end > 0); + assert(nfa->maxOffset); + q->items[q->end - 1].location = nfa->maxOffset + 1; + } + return rv; + } + return rv && !q_trimmed && !q_trimmed_ra; +} + +char nfaReportCurrentMatches(const struct NFA *nfa, struct mq *q) { + DISPATCH_BY_NFA_TYPE(_reportCurrent(nfa, q)); + return 0; +} + +char nfaInAcceptState(const struct NFA *nfa, ReportID report, struct mq *q) { + DISPATCH_BY_NFA_TYPE(_inAccept(nfa, report, q)); + return 0; +} + +char nfaInAnyAcceptState(const struct NFA *nfa, struct mq *q) { + DISPATCH_BY_NFA_TYPE(_inAnyAccept(nfa, q)); + return 0; +} + +char nfaQueueExecRose(const struct NFA *nfa, struct mq *q, ReportID r) { + DEBUG_PRINTF("nfa=%p\n", nfa); +#ifdef DEBUG + debugQueue(q); +#endif + + assert(q && !q->context && q->state); + assert(q->cur <= q->end); + assert(q->end <= MAX_MQE_LEN); + assert(ISALIGNED_CL(nfa) && ISALIGNED_CL(getImplNfa(nfa))); + assert(!q->report_current); + + return nfaQueueExecRose_i(nfa, q, r); +} + +char nfaBlockExecReverse(const struct NFA *nfa, u64a offset, const u8 *buf, + size_t buflen, const u8 *hbuf, size_t hlen, + NfaCallback callback, void *context) { + assert(nfa); + assert(ISALIGNED_CL(nfa) && ISALIGNED_CL(getImplNfa(nfa))); + + DISPATCH_BY_NFA_TYPE(_B_Reverse(nfa, offset, buf, buflen, hbuf, hlen, + callback, context)); + return 0; +} + +char nfaQueueCompressState(const struct NFA *nfa, const struct mq *q, + s64a loc) { + assert(nfa && q); + assert(ISALIGNED_CL(nfa) && ISALIGNED_CL(getImplNfa(nfa))); + + DISPATCH_BY_NFA_TYPE(_queueCompressState(nfa, q, loc)); + return 0; +} + +char nfaExpandState(const struct NFA *nfa, void *dest, const void *src, + u64a offset, u8 key) { + assert(nfa && dest && src); + assert(ISALIGNED_CL(nfa) && ISALIGNED_CL(getImplNfa(nfa))); + + DISPATCH_BY_NFA_TYPE(_expandState(nfa, dest, src, offset, key)); + return 0; +} + +char nfaInitCompressedState(const struct NFA *nfa, u64a offset, void *state, + u8 key) { + assert(nfa && state); + assert(ISALIGNED_CL(nfa) && ISALIGNED_CL(getImplNfa(nfa))); + + DISPATCH_BY_NFA_TYPE(_initCompressedState(nfa, offset, state, key)); + return 0; +} + +enum nfa_zombie_status nfaGetZombieStatus(const struct NFA *nfa, struct mq *q, + s64a loc) { + DISPATCH_BY_NFA_TYPE(_zombie_status(nfa, q, loc)); + return NFA_ZOMBIE_NO; +} diff --git a/regex/nfa/nfa_api_queue.h b/regex/nfa/nfa_api_queue.h new file mode 100644 index 000000000..e3579a7ee --- /dev/null +++ b/regex/nfa/nfa_api_queue.h @@ -0,0 +1,289 @@ +/* + * Copyright (c) 2015-2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef NFA_API_QUEUE_H +#define NFA_API_QUEUE_H + +#ifdef __cplusplus +extern "C" +{ +#endif + +#include "ue2common.h" +#include "callback.h" + +/** Size of mq::items, max elements on a queue. */ +#define MAX_MQE_LEN 10 + +/** Queue events */ + +/** Queue event: begin scanning. Note: stateless engines will start from this + * location. */ +#define MQE_START 0U + +/** Queue event: stop scanning. */ +#define MQE_END 1U + +/** Queue event: enable start and start-dot-star. */ +#define MQE_TOP 2U + +/** Queue event: first event corresponding to a numbered TOP. Additional tops + * (in multi-top engines) use the event values from MQE_TOP_FIRST to + * MQE_INVALID - 1. */ +#define MQE_TOP_FIRST 4U + +/** Invalid queue event */ +#define MQE_INVALID (~0U) + +/** Queue item */ +struct mq_item { + u32 type; /**< event type, from MQE_* */ + s64a location; /**< relative to the start of the current buffer */ + u64a som; /**< pattern start-of-match corresponding to a top, only used + * by som engines. */ +}; + +// Forward decl. +struct NFA; + +/** + * Queue of events to control engine execution. mq::cur is index of first + * valid event, mq::end is one past the index of last valid event. + */ +struct mq { + const struct NFA *nfa; /**< nfa corresponding to the queue */ + u32 cur; /**< index of the first valid item in the queue */ + u32 end; /**< index one past the last valid item in the queue */ + char *state; /**< uncompressed stream state; lives in scratch */ + char *streamState; /**< + * real stream state; used to access structures which + * not duplicated the scratch state (bounded repeats, + * etc) */ + u64a offset; /**< base offset of the buffer */ + const u8 *buffer; /**< buffer to scan */ + size_t length; /**< length of buffer */ + const u8 *history; /**< + * history buffer; (logically) immediately before the + * main buffer */ + size_t hlength; /**< length of the history buffer */ + struct hs_scratch *scratch; /**< global scratch space */ + char report_current; /**< + * report_current matches at starting offset through + * callback. If true, the queue must be located at a + * point where MO_MATCHES_PENDING was returned */ + NfaCallback cb; /**< callback to trigger on matches */ + void *context; /**< context to pass along with a callback */ + struct mq_item items[MAX_MQE_LEN]; /**< queue items */ +}; + + +/** + * Pushes an (event, location, som) item onto a queue. If it is identical to the + * previous item on the queue, it is not added to the queue. 
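+ *
+ * Illustrative sketch only (buf_len and the single MQE_TOP are assumptions,
+ * not requirements of this header): a queue for a fresh scan is typically
+ * seeded with a start event, any tops, and an end event, e.g.
+ *
+ *     q->cur = q->end = 0;
+ *     pushQueueNoMerge(q, MQE_START, 0);
+ *     pushQueue(q, MQE_TOP, 0);
+ *     pushQueueNoMerge(q, MQE_END, buf_len);
+ *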
+ * @param q queue + * @param e event + * @param som som marker + * @param loc event location + */ +static really_inline +void pushQueueSom(struct mq * restrict q, u32 e, s64a loc, u64a som) { + DEBUG_PRINTF("pushing %u@%lld -> %u [som = %llu]\n", e, loc, q->end, som); + assert(q->end < MAX_MQE_LEN); + assert(e < MQE_INVALID); +/* stop gcc getting too smart for its own good */ +/* assert(!q->end || q->items[q->end - 1].location <= loc); */ + assert(q->end || e == MQE_START); + + // Avoid duplicate items on the queue. + if (q->end) { + struct mq_item *item = &q->items[q->end - 1]; + if (item->type == e && item->location == loc) { + DEBUG_PRINTF("dropping duplicate item\n"); + LIMIT_TO_AT_MOST(&item->som, som); /* take lower som */ + return; + } + } + + u32 end = q->end; + struct mq_item *item = &q->items[end]; + item->type = e; + item->location = loc; + item->som = som; + q->end = end + 1; +} + +/** + * Pushes an (event, location) item onto a queue. If it is identical to the + * previous item on the queue, it is not added to the queue. + * @param q queue + * @param e event + * @param loc event location + */ +static really_inline +void pushQueue(struct mq * restrict q, u32 e, s64a loc) { + pushQueueSom(q, e, loc, 0); +} + +/** + * Pushes an (event, location) item onto a queue. + * This version of @ref pushQueue does not check to ensure that the item being + * added is not already on the queue. Used for events other than tops. + */ +static really_inline +void pushQueueNoMerge(struct mq * restrict q, u32 e, s64a loc) { + DEBUG_PRINTF("pushing %u@%lld -> %u\n", e, loc, q->end); + assert(q->end < MAX_MQE_LEN); + assert(e < MQE_INVALID); +/* stop gcc getting too smart for its own good */ +/* assert(!q->end || q->items[q->end - 1].location <= loc); */ + assert(q->end || e == MQE_START); + +#ifndef NDEBUG + // We assert that the event is different from its predecessor. If it's a + // dupe, you should have used the ordinary pushQueue call. + if (q->end) { + UNUSED struct mq_item *prev = &q->items[q->end - 1]; + assert(prev->type != e || prev->location != loc); + } +#endif + + u32 end = q->end; + struct mq_item *item = &q->items[end]; + item->type = e; + item->location = loc; + item->som = 0; + q->end = end + 1; +} + +/** \brief Returns the type of the current queue event. */ +static really_inline u32 q_cur_type(const struct mq *q) { + assert(q->cur < q->end); + assert(q->cur < MAX_MQE_LEN); + return q->items[q->cur].type; +} + +/** \brief Returns the location (relative to the beginning of the current data + * buffer) of the current queue event. */ +static really_inline s64a q_cur_loc(const struct mq *q) { + assert(q->cur < q->end); + assert(q->cur < MAX_MQE_LEN); + return q->items[q->cur].location; +} + +/** \brief Returns the type of the last event in the queue. */ +static really_inline u32 q_last_type(const struct mq *q) { + assert(q->cur < q->end); + assert(q->end > 0); + assert(q->end <= MAX_MQE_LEN); + return q->items[q->end - 1].type; +} + +/** \brief Returns the location (relative to the beginning of the current data + * buffer) of the last event in the queue. */ +static really_inline s64a q_last_loc(const struct mq *q) { + assert(q->cur < q->end); + assert(q->end > 0); + assert(q->end <= MAX_MQE_LEN); + return q->items[q->end - 1].location; +} + +/** \brief Returns the absolute stream offset of the current queue event. 
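+ * Equivalent to q->offset + q_cur_loc(q): for example, with q->offset == 100
+ * and the current event at buffer-relative location 5, this returns 105.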
*/ +static really_inline u64a q_cur_offset(const struct mq *q) { + assert(q->cur < q->end); + assert(q->cur < MAX_MQE_LEN); + return q->offset + (u64a)q->items[q->cur].location; +} + +/** + * \brief Removes all events in the queue before the given location. + */ +static really_inline +void q_skip_forward_to(struct mq *q, s64a min_loc) { + assert(q->cur < q->end); + assert(q->cur < MAX_MQE_LEN); + assert(q->items[q->cur].type == MQE_START); + + if (q_cur_loc(q) >= min_loc) { + DEBUG_PRINTF("all events >= loc %lld\n", min_loc); + return; + } + + const u32 start_loc = q->cur; + + do { + DEBUG_PRINTF("remove item with loc=%lld\n", q_cur_loc(q)); + q->cur++; + } while (q->cur < q->end && q_cur_loc(q) < min_loc); + + if (q->cur > start_loc) { + // Move original MQE_START item forward. + q->cur--; + q->items[q->cur] = q->items[start_loc]; + } +} + +#ifdef DEBUG +// Dump the contents of the given queue. +static never_inline UNUSED +void debugQueue(const struct mq *q) { + DEBUG_PRINTF("q=%p, nfa=%p\n", q, q->nfa); + DEBUG_PRINTF("q offset=%llu, buf={%p, len=%zu}, history={%p, len=%zu}\n", + q->offset, q->buffer, q->length, q->history, q->hlength); + DEBUG_PRINTF("q cur=%u, end=%u\n", q->cur, q->end); + for (u32 cur = q->cur; cur < q->end; cur++) { + const char *type = "UNKNOWN"; + u32 e = q->items[cur].type; + switch (e) { + case MQE_START: + type = "MQE_START"; + break; + case MQE_END: + type = "MQE_END"; + break; + case MQE_TOP: + type = "MQE_TOP"; + break; + case MQE_INVALID: + type = "MQE_INVALID"; + break; + default: + assert(e >= MQE_TOP_FIRST && e < MQE_INVALID); + type = "MQE_TOP_N"; + break; + } + DEBUG_PRINTF("\tq[%u] %lld %u:%s\n", cur, q->items[cur].location, + q->items[cur].type, type); + } +} +#endif // DEBUG + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/regex/nfa/nfa_api_util.h b/regex/nfa/nfa_api_util.h new file mode 100644 index 000000000..affc5f38f --- /dev/null +++ b/regex/nfa/nfa_api_util.h @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2015, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef NFA_API_UTIL_H +#define NFA_API_UTIL_H + +#include "nfa_api_queue.h" +#include "ue2common.h" + +/* returns the byte prior to the given location, NUL if not available */ +static really_inline +u8 queue_prev_byte(const struct mq *q, s64a loc) { + if (loc <= 0) { + if (1LL - loc > (s64a)q->hlength) { + return 0; /* assume NUL for start of stream write */ + } + // In the history buffer. + assert(q->history); + assert(q->hlength >= (u64a)(loc * -1)); + return q->history[q->hlength - 1 + loc]; + } else { + // In the stream write buffer. + assert(q->buffer); + assert(q->length >= (u64a)loc); + return q->buffer[loc - 1]; + } +} + +/* this is a modified version of pushQueue where we statically know the state of + * the queue. Does not attempt to merge and inserts at the given queue + * position. */ +static really_inline +void pushQueueAt(struct mq * restrict q, u32 pos, u32 e, s64a loc) { + assert(pos == q->end); + DEBUG_PRINTF("pushing %u@%lld -> %u\n", e, loc, q->end); + assert(q->end < MAX_MQE_LEN); + assert(e < MQE_INVALID); +/* stop gcc getting too smart for its own good */ +/* assert(!q->end || q->items[q->end - 1].location <= loc); */ + assert(q->end || e == MQE_START); + +#ifndef NDEBUG + // We assert that the event is different from its predecessor. If it's a + // dupe, you should have used the ordinary pushQueue call. + if (q->end) { + UNUSED struct mq_item *prev = &q->items[q->end - 1]; + assert(prev->type != e || prev->location != loc); + } +#endif + + struct mq_item *item = &q->items[pos]; + item->type = e; + item->location = loc; + item->som = 0; + q->end = pos + 1; +} +#endif diff --git a/regex/nfa/nfa_internal.h b/regex/nfa/nfa_internal.h new file mode 100644 index 000000000..ad27e28b1 --- /dev/null +++ b/regex/nfa/nfa_internal.h @@ -0,0 +1,266 @@ +/* + * Copyright (c) 2015-2020, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + \brief Declarations for the main NFA engine types and structures. 
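+
+ An engine blob is laid out as a struct NFA header followed immediately in
+ memory by the engine-specific implementation structure; the getImplNfa()
+ macro below returns a pointer to that implementation region.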
+*/ +#ifndef NFA_INTERNAL_H +#define NFA_INTERNAL_H + +#ifdef __cplusplus +extern "C" +{ +#endif + +#include "ue2common.h" + +// Constants + +#define MO_INVALID_IDX 0xffffffff /**< index meaning value is invalid */ + +// Flags (used in NFA::flags) + +#define NFA_ACCEPTS_EOD 1U /**< can produce matches on EOD. */ +#define NFA_ZOMBIE 2U /**< supports zombies */ + +// Common data structures for NFAs + +enum NFAEngineType { + LIMEX_NFA_32, + LIMEX_NFA_64, + LIMEX_NFA_128, + LIMEX_NFA_256, + LIMEX_NFA_384, + LIMEX_NFA_512, + MCCLELLAN_NFA_8, /**< magic pseudo nfa */ + MCCLELLAN_NFA_16, /**< magic pseudo nfa */ + GOUGH_NFA_8, /**< magic pseudo nfa */ + GOUGH_NFA_16, /**< magic pseudo nfa */ + MPV_NFA, /**< magic pseudo nfa */ + LBR_NFA_DOT, /**< magic pseudo nfa */ + LBR_NFA_VERM, /**< magic pseudo nfa */ + LBR_NFA_NVERM, /**< magic pseudo nfa */ + LBR_NFA_SHUF, /**< magic pseudo nfa */ + LBR_NFA_TRUF, /**< magic pseudo nfa */ + CASTLE_NFA, /**< magic pseudo nfa */ + SHENG_NFA, /**< magic pseudo nfa */ + TAMARAMA_NFA, /**< magic nfa container */ + MCSHENG_NFA_8, /**< magic pseudo nfa */ + MCSHENG_NFA_16, /**< magic pseudo nfa */ + SHENG_NFA_32, /**< magic pseudo nfa */ + SHENG_NFA_64, /**< magic pseudo nfa */ + MCSHENG_64_NFA_8, /**< magic pseudo nfa */ + MCSHENG_64_NFA_16, /**< magic pseudo nfa */ + /** \brief bogus NFA - not used */ + INVALID_NFA +}; + +/** \brief header for the NFA implementation. */ +struct ALIGN_CL_DIRECTIVE NFA { + u32 flags; + + /** \brief The size in bytes of the NFA engine. The engine is + * serialized to the extent that copying length bytes back into a + * 16-byte aligned memory location yields a structure that has the same + * behaviour as the original engine. */ + u32 length; + + /** \brief Active implementation used by this NFAEngineType */ + u8 type; + + u8 rAccelType; + u8 rAccelOffset; + u8 maxBiAnchoredWidth; /**< if non zero, max width of the block */ + + union { + u8 c; + u16 dc; + u8 array[2]; + } rAccelData; + + u32 queueIndex; /**< index of the associated queue in scratch */ + + /** \brief The number of valid positions/states for this NFA. Debug only */ + u32 nPositions; + + /** \brief Size of the state required in scratch space. + * + * This state has less strict size requirements (as it doesn't go in stream + * state) and does not persist between stream writes. + */ + u32 scratchStateSize; + + /** \brief Size of the state required in stream state. + * + * This encompasses all state stored by the engine that must persist between + * stream writes. */ + u32 streamStateSize; + + u32 maxWidth; /**< longest possible match in this NFA, 0 if unbounded */ + u32 minWidth; /**< minimum bytes required to match this NFA */ + u32 maxOffset; /**< non zero: maximum offset this pattern can match at */ + + /* Note: implementation (e.g. a LimEx) directly follows struct in memory */ +} ; + +// Accessor macro for the implementation NFA: we do things this way to avoid +// type-punning warnings. +#define getImplNfa(nfa) \ + ((const void *)((const char *)(nfa) + sizeof(struct NFA))) + +// Non-const version of the above, used at compile time. +#define getMutableImplNfa(nfa) ((char *)(nfa) + sizeof(struct NFA)) + +static really_inline u32 nfaAcceptsEod(const struct NFA *nfa) { + return nfa->flags & NFA_ACCEPTS_EOD; +} + +static really_inline u32 nfaSupportsZombie(const struct NFA *nfa) { + return nfa->flags & NFA_ZOMBIE; +} + +/** \brief True if the given type (from NFA::type) is a McClellan DFA. 
*/ +static really_inline int isMcClellanType(u8 t) { + return t == MCCLELLAN_NFA_8 || t == MCCLELLAN_NFA_16; +} + +/** \brief True if the given type (from NFA::type) is a Sheng-McClellan hybrid + * DFA. */ +static really_inline int isShengMcClellanType(u8 t) { + return t == MCSHENG_NFA_8 || t == MCSHENG_NFA_16 || + t == MCSHENG_64_NFA_8 || t == MCSHENG_64_NFA_16; +} + +/** \brief True if the given type (from NFA::type) is a Gough DFA. */ +static really_inline int isGoughType(u8 t) { + return t == GOUGH_NFA_8 || t == GOUGH_NFA_16; +} + +/** \brief True if the given type (from NFA::type) is a Sheng DFA. */ +static really_inline int isSheng16Type(u8 t) { + return t == SHENG_NFA; +} + +/** \brief True if the given type (from NFA::type) is a Sheng32 DFA. */ +static really_inline int isSheng32Type(u8 t) { + return t == SHENG_NFA_32; +} + +/** \brief True if the given type (from NFA::type) is a Sheng64 DFA. */ +static really_inline int isSheng64Type(u8 t) { + return t == SHENG_NFA_64; +} + +/** \brief True if the given type (from NFA::type) is a Sheng16/32/64 DFA. */ +static really_inline int isShengType(u8 t) { + return t == SHENG_NFA || t == SHENG_NFA_32 || t == SHENG_NFA_64; +} + +/** + * \brief True if the given type (from NFA::type) is a McClellan, Gough or + * Sheng DFA. + */ +static really_inline int isDfaType(u8 t) { + return isMcClellanType(t) || isGoughType(t) || isShengType(t) + || isShengMcClellanType(t); +} + +static really_inline int isBigDfaType(u8 t) { + return t == MCCLELLAN_NFA_16 || t == MCSHENG_NFA_16 || t == GOUGH_NFA_16; +} + +static really_inline int isSmallDfaType(u8 t) { + return isDfaType(t) && !isBigDfaType(t); +} + +/** \brief True if the given type (from NFA::type) is an NFA. */ +static really_inline int isNfaType(u8 t) { + switch (t) { + case LIMEX_NFA_32: + case LIMEX_NFA_64: + case LIMEX_NFA_128: + case LIMEX_NFA_256: + case LIMEX_NFA_384: + case LIMEX_NFA_512: + return 1; + default: + break; + } + return 0; +} + +/** \brief True if the given type (from NFA::type) is an LBR. */ +static really_inline +int isLbrType(u8 t) { + return t == LBR_NFA_DOT || t == LBR_NFA_VERM || t == LBR_NFA_NVERM || + t == LBR_NFA_SHUF || t == LBR_NFA_TRUF; +} + +/** \brief True if the given type (from NFA::type) is a container engine. */ +static really_inline +int isContainerType(u8 t) { + return t == TAMARAMA_NFA; +} + +static really_inline +int isMultiTopType(u8 t) { + return !isDfaType(t) && !isLbrType(t); +} + +/** Macros used in place of unimplemented NFA API functions for a given + * engine. */ +#if !defined(_WIN32) + +/* Use for functions that return an integer. */ +#define NFA_API_NO_IMPL(...) \ + ({ \ + assert(!"not implemented for this engine!"); \ + 0; /* return value, for places that need it */ \ + }) + +/* Use for _zombie_status functions. */ +#define NFA_API_ZOMBIE_NO_IMPL(...) \ + ({ \ + assert(!"not implemented for this engine!"); \ + NFA_ZOMBIE_NO; \ + }) + +#else + +/* Simpler implementation for compilers that don't like the GCC extension used + * above. */ +#define NFA_API_NO_IMPL(...) 0 +#define NFA_API_ZOMBIE_NO_IMPL(...) 
NFA_ZOMBIE_NO + +#endif + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/regex/nfa/nfa_rev_api.h b/regex/nfa/nfa_rev_api.h new file mode 100644 index 000000000..370f96ef6 --- /dev/null +++ b/regex/nfa/nfa_rev_api.h @@ -0,0 +1,157 @@ +/* + * Copyright (c) 2015, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Reverse-acceleration optimizations for the NFA API block mode scans. 
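+ *
+ * These helpers use reverse acceleration (reverse Vermicelli scans and
+ * end-of-data byte checks) to shrink the length of a block scan when the
+ * trailing bytes cannot take part in a match; see nfaRevAccelCheck() below.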
+ */
+
+#ifndef NFA_REV_API_H
+#define NFA_REV_API_H
+
+#include "accel.h"
+#include "nfa_internal.h"
+#include "vermicelli.h"
+#include "util/unaligned.h"
+
+static really_inline
+size_t nfaRevAccel_i(const struct NFA *nfa, const u8 *buffer, size_t length) {
+    DEBUG_PRINTF("checking rev accel mw %u\n", nfa->minWidth);
+    assert(nfa->rAccelOffset >= 1);
+    assert(nfa->rAccelOffset <= nfa->minWidth);
+
+    const u8 *rv; // result for accel engine
+
+    switch (nfa->rAccelType) {
+    case ACCEL_RVERM:
+        DEBUG_PRINTF("ACCEL_RVERM\n");
+        if (length + 1 - nfa->rAccelOffset < 16) {
+            break;
+        }
+
+        rv = rvermicelliExec(nfa->rAccelData.c, 0, buffer,
+                             buffer + length + 1 - nfa->rAccelOffset);
+        length = (size_t)(rv - buffer + nfa->rAccelOffset);
+        break;
+    case ACCEL_RVERM_NOCASE:
+        DEBUG_PRINTF("ACCEL_RVERM_NOCASE\n");
+        if (length + 1 - nfa->rAccelOffset < 16) {
+            break;
+        }
+
+        rv = rvermicelliExec(nfa->rAccelData.c, 1, buffer,
+                             buffer + length + 1 - nfa->rAccelOffset);
+        length = (size_t)(rv - buffer + nfa->rAccelOffset);
+        break;
+    case ACCEL_RDVERM:
+        DEBUG_PRINTF("ACCEL_RDVERM\n");
+        if (length + 1 - nfa->rAccelOffset < 17) {
+            break;
+        }
+
+        rv = rvermicelliDoubleExec(nfa->rAccelData.array[0],
+                                   nfa->rAccelData.array[1], 0, buffer,
+                                   buffer + length + 1 - nfa->rAccelOffset);
+        length = (size_t)(rv - buffer + nfa->rAccelOffset);
+        break;
+    case ACCEL_RDVERM_NOCASE:
+        DEBUG_PRINTF("ACCEL_RDVERM_NOCASE\n");
+        if (length + 1 - nfa->rAccelOffset < 17) {
+            break;
+        }
+
+        rv = rvermicelliDoubleExec(nfa->rAccelData.array[0],
+                                   nfa->rAccelData.array[1], 1, buffer,
+                                   buffer + length + 1 - nfa->rAccelOffset);
+        length = (size_t)(rv - buffer + nfa->rAccelOffset);
+        break;
+    case ACCEL_REOD:
+        DEBUG_PRINTF("ACCEL_REOD\n");
+        if (buffer[length - nfa->rAccelOffset] != nfa->rAccelData.c) {
+            return 0;
+        }
+        break;
+    case ACCEL_REOD_NOCASE:
+        DEBUG_PRINTF("ACCEL_REOD_NOCASE\n");
+        if ((buffer[length - nfa->rAccelOffset] & CASE_CLEAR) !=
+            nfa->rAccelData.c) {
+            return 0;
+        }
+        break;
+    case ACCEL_RDEOD:
+        DEBUG_PRINTF("ACCEL_RDEOD\n");
+        if (unaligned_load_u16(buffer + length - nfa->rAccelOffset) !=
+            nfa->rAccelData.dc) {
+            return 0;
+        }
+        break;
+    case ACCEL_RDEOD_NOCASE:
+        DEBUG_PRINTF("ACCEL_RDEOD_NOCASE\n");
+        if ((unaligned_load_u16(buffer + length - nfa->rAccelOffset) &
+             DOUBLE_CASE_CLEAR) != nfa->rAccelData.dc) {
+            return 0;
+        }
+        break;
+    default:
+        assert(!"not here");
+    }
+
+    if (nfa->minWidth > length) {
+        DEBUG_PRINTF("post-accel, scan skipped: %zu < min %u bytes\n", length,
+                     nfa->minWidth);
+        return 0;
+    }
+
+    return length;
+}
+
+/** \brief Reverse acceleration check. Returns a new length for the block,
+ * guaranteeing that a match cannot occur beyond that point. */
+static really_inline
+size_t nfaRevAccelCheck(const struct NFA *nfa, const u8 *buffer,
+                        size_t length) {
+    assert(nfa);
+
+    // If this block is not long enough to satisfy the minimum width
+    // constraint on this NFA, we can avoid the scan altogether.
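+    // (For example, with minWidth == 8 a 5-byte block can never contain a
+    // match, so we return 0 and the caller can skip this NFA entirely.)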
+ if (nfa->minWidth > length) { + DEBUG_PRINTF("scan skipped: %zu < min %u bytes\n", length, + nfa->minWidth); + return 0; + } + + if (nfa->rAccelType == ACCEL_NONE) { + DEBUG_PRINTF("no rev accel available\n"); + return length; + } + + size_t rv_length = nfaRevAccel_i(nfa, buffer, length); + assert(rv_length <= length); + return rv_length; +} + +#endif diff --git a/regex/nfa/repeat.c b/regex/nfa/repeat.c new file mode 100644 index 000000000..946459415 --- /dev/null +++ b/regex/nfa/repeat.c @@ -0,0 +1,1611 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief API for handling bounded repeats. + * + * This file provides an internal API for handling bounded repeats of character + * classes. It is used by the Large Bounded Repeat (LBR) engine and by the + * bounded repeat handling in the LimEx NFA engine as well. + */ +#include "repeat.h" +#include "util/bitutils.h" +#include "util/multibit.h" +#include "util/pack_bits.h" +#include "util/partial_store.h" +#include "util/unaligned.h" + +#ifndef __KERNEL__ +#include +#include +#else +#include +#include +#include +#define UINT32_MAX U32_MAX +#endif + +/** \brief Returns the total capacity of the ring. + * Note that it's currently one greater than repeatMax so that we can handle + * cases where the tug and pos triggers overlap. */ +static +u32 ringCapacity(const struct RepeatInfo *info) { + return info->repeatMax + 1; +} + +/** \brief Returns the number of elements currently in the ring. Note that if + * the first and last indices are equal, the ring is full. */ +static +u32 ringOccupancy(const struct RepeatRingControl *xs, const u32 ringSize) { + if (xs->last > xs->first) { + return xs->last - xs->first; + } else { // wrapped + return ringSize - (xs->first - xs->last); + } +} + +/** \brief Returns the offset of the _last_ top stored in the ring. 
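+ * For example, with xs->offset == 100 and three tops currently stored in the
+ * ring, the most recent top was stored for offset 102.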
*/ +static +u64a ringLastTop(const struct RepeatRingControl *xs, const u32 ringSize) { + return xs->offset + ringOccupancy(xs, ringSize) - 1; +} + +#if !defined(NDEBUG) || defined(DUMP_SUPPORT) +/** \brief For debugging: returns the total capacity of the range list. */ +static UNUSED +u32 rangeListCapacity(const struct RepeatInfo *info) { + u32 d = info->repeatMax - info->repeatMin; + assert(d > 0); // should be in a RING model! + return 2 * ((info->repeatMax / d) + 1); +} +#endif + +#ifdef DEBUG +static +void dumpRing(const struct RepeatInfo *info, const struct RepeatRingControl *xs, + const u8 *ring) { + const u32 ringSize = ringCapacity(info); + DEBUG_PRINTF("ring (occ %u/%u, %u->%u): ", ringOccupancy(xs, ringSize), + ringSize, xs->first, xs->last); + + u16 i = xs->first, n = 0; + do { + if (mmbit_isset(ring, ringSize, i)) { + u64a ringOffset = xs->offset + n; + printf("%llu ", ringOffset); + } + ++i, ++n; + if (i == ringSize) { + i = 0; + } + } while (i != xs->last); + printf("\n"); +} + +static +void dumpRange(const struct RepeatInfo *info, + const struct RepeatRangeControl *xs, const u16 *ring) { + const u32 ringSize = rangeListCapacity(info); + DEBUG_PRINTF("ring (occ %u/%u): ", xs->num, ringSize); + + if (xs->num) { + for (u32 i = 0; i < xs->num; i++) { + printf("%llu ", xs->offset + unaligned_load_u16(ring + i)); + } + } else { + printf("empty"); + } + printf("\n"); +} + +static +void dumpBitmap(const struct RepeatBitmapControl *xs) { + DEBUG_PRINTF("bitmap (base=%llu): ", xs->offset); + u64a bitmap = xs->bitmap; + while (bitmap) { + printf("%llu ", xs->offset + findAndClearLSB_64(&bitmap)); + } + printf("\n"); +} + +static +void dumpTrailer(const struct RepeatInfo *info, + const struct RepeatTrailerControl *xs) { + const u64a m_width = info->repeatMax - info->repeatMin; + DEBUG_PRINTF("trailer: current extent is [%llu,%llu]", xs->offset, + xs->offset + m_width); + u64a bitmap = xs->bitmap; + if (bitmap) { + printf(", also matches at: "); + while (bitmap) { + u32 idx = findAndClearMSB_64(&bitmap); + printf("%llu ", xs->offset - idx - 1); + } + } else { + printf(", no earlier matches"); + } + printf("\n"); +} + +#endif // DEBUG + +#ifndef NDEBUG +/** \brief For debugging: returns true if the range is ordered with no dupes. 
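+ * That is, each stored u16 delta must be strictly greater than its
+ * predecessor.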
*/ +static UNUSED +int rangeListIsOrdered(const struct RepeatRangeControl *xs, const u16 *ring) { + for (u32 i = 1; i < xs->num; i++) { + u16 a = unaligned_load_u16(ring + i - 1); + u16 b = unaligned_load_u16(ring + i); + if (a >= b) { + return 0; + } + } + return 1; +} +#endif + +u64a repeatLastTopRing(const struct RepeatInfo *info, + const union RepeatControl *ctrl) { + const u32 ringSize = ringCapacity(info); + return ringLastTop(&ctrl->ring, ringSize); +} + +u64a repeatLastTopRange(const union RepeatControl *ctrl, const void *state) { + const u16 *ring = (const u16 *)state; + const struct RepeatRangeControl *xs = &ctrl->range; + assert(xs->num); + return xs->offset + unaligned_load_u16(ring + xs->num - 1); +} + +u64a repeatLastTopBitmap(const union RepeatControl *ctrl) { + const struct RepeatBitmapControl *xs = &ctrl->bitmap; + if (!xs->bitmap) { + /* last top was too long ago */ + return 0; + } + return xs->offset + 63 - clz64(xs->bitmap); +} + +u64a repeatLastTopTrailer(const struct RepeatInfo *info, + const union RepeatControl *ctrl) { + const struct RepeatTrailerControl *xs = &ctrl->trailer; + assert(xs->offset >= info->repeatMin); + return xs->offset - info->repeatMin; +} + +u64a repeatNextMatchRing(const struct RepeatInfo *info, + const union RepeatControl *ctrl, const void *state, + u64a offset) { + const struct RepeatRingControl *xs = &ctrl->ring; + const u8 *ring = (const u8 *)state; + const u32 ringSize = ringCapacity(info); + + // We should have at least one top stored. + assert(mmbit_any(ring, ringSize)); + assert(info->repeatMax < REPEAT_INF); + + // Increment offset, as we want the NEXT match. + offset++; + + const u64a base_offset = xs->offset; + DEBUG_PRINTF("offset=%llu, base_offset=%llu\n", offset, base_offset); + + u64a delta = offset - base_offset; + if (offset < base_offset || delta < info->repeatMin) { + DEBUG_PRINTF("before min repeat\n"); + return base_offset + info->repeatMin; + } + if (offset > ringLastTop(xs, ringSize) + info->repeatMax) { + DEBUG_PRINTF("ring is stale\n"); + return 0; // no more matches + } + + DEBUG_PRINTF("delta=%llu\n", delta); + u64a lower = delta > info->repeatMax ? delta - info->repeatMax : 0; + DEBUG_PRINTF("lower=%llu\n", lower); + + assert(lower < ringSize); + + // First scan, either to xs->last if there's no wrap-around or ringSize + // (end of the underlying multibit) if we are wrapping. + + u32 begin = xs->first + lower; + if (begin >= ringSize) { + // This branch and sub tested a lot faster than using % (integer div). + begin -= ringSize; + } + const u32 end = begin >= xs->last ? ringSize : xs->last; + u32 i = mmbit_iterate_bounded(ring, ringSize, begin, end); + if (i != MMB_INVALID) { + u32 j = i - begin + lower; + return MAX(offset, base_offset + j + info->repeatMin); + } + + // A second scan is necessary if we need to cope with wrap-around in the + // ring buffer. 
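+    // (Sketch: with ringSize == 8, first == 6 and last == 3, a scan starting
+    // at index 7 runs to the end of the underlying multibit and must then be
+    // resumed over indices [0, last).)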
+ + if (begin >= xs->last) { + i = mmbit_iterate_bounded(ring, ringSize, 0, xs->last); + if (i != MMB_INVALID) { + u32 j = i + (ringSize - begin) + lower; + return MAX(offset, base_offset + j + info->repeatMin); + } + } + + return 0; +} + +u64a repeatNextMatchRange(const struct RepeatInfo *info, + const union RepeatControl *ctrl, const void *state, + u64a offset) { + const struct RepeatRangeControl *xs = &ctrl->range; + const u16 *ring = (const u16 *)state; + + assert(xs->num > 0); + assert(xs->num <= rangeListCapacity(info)); + assert(rangeListIsOrdered(xs, ring)); + assert(info->repeatMax < REPEAT_INF); + + for (u32 i = 0; i < xs->num; i++) { + u64a base = xs->offset + unaligned_load_u16(ring + i); + u64a first = base + info->repeatMin; + if (offset < first) { + return first; + } + if (offset < base + info->repeatMax) { + return offset + 1; + } + } + + return 0; +} + +u64a repeatNextMatchBitmap(const struct RepeatInfo *info, + const union RepeatControl *ctrl, u64a offset) { + const struct RepeatBitmapControl *xs = &ctrl->bitmap; + const u64a base = xs->offset; + u64a bitmap = xs->bitmap; + + // FIXME: quick exit if there is no match, based on last top in bitmap? + + while (bitmap) { + u64a top = base + findAndClearLSB_64(&bitmap); + if (offset < top + info->repeatMin) { + return top + info->repeatMin; + } + if (offset < top + info->repeatMax) { + return offset + 1; + } + } + + return 0; // No more matches. +} + +u64a repeatNextMatchTrailer(const struct RepeatInfo *info, + const union RepeatControl *ctrl, u64a offset) { + const struct RepeatTrailerControl *xs = &ctrl->trailer; + const u32 m_width = info->repeatMax - info->repeatMin; + + DEBUG_PRINTF("offset=%llu, xs->offset=%llu\n", offset, xs->offset); + DEBUG_PRINTF("{%u,%u} repeat, m_width=%u\n", info->repeatMin, + info->repeatMax, m_width); + + assert(xs->offset >= info->repeatMin); + + if (offset >= xs->offset + m_width) { + DEBUG_PRINTF("no more matches\n"); + return 0; + } + + if (offset >= xs->offset) { + DEBUG_PRINTF("inside most recent match window, next match %llu\n", + offset + 1); + return offset + 1; + } + + // Offset is before the match window, we need to consult the bitmap of + // earlier match offsets. + u64a bitmap = xs->bitmap; + + u64a diff = xs->offset - offset; + DEBUG_PRINTF("diff=%llu\n", diff); + if (diff <= 64) { + assert(diff); + bitmap &= (1ULL << (diff - 1)) - 1; + } + DEBUG_PRINTF("bitmap = 0x%llx\n", bitmap); + if (bitmap) { + u32 idx = 63 - clz64(bitmap); + DEBUG_PRINTF("clz=%u, idx = %u -> offset %llu\n", clz64(bitmap), idx, + xs->offset - idx); + DEBUG_PRINTF("next match at %llu\n", xs->offset - idx - 1); + u64a next_match = xs->offset - idx - 1; + assert(next_match > offset); + return next_match; + } + + DEBUG_PRINTF("next match is start of match window, %llu\n", xs->offset); + return xs->offset; +} + +/** \brief Store the first top in the ring buffer. 
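+ * The ring is cleared so that the new top occupies index 0, leaving
+ * xs->first == 0 and xs->last == 1.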
*/ +static +void storeInitialRingTop(struct RepeatRingControl *xs, u8 *ring, + u64a offset, const u32 ringSize) { + DEBUG_PRINTF("ring=%p, ringSize=%u\n", ring, ringSize); + xs->offset = offset; + mmbit_clear(ring, ringSize); + mmbit_set(ring, ringSize, 0); + xs->first = 0; + xs->last = 1; +} + +static really_inline +char ringIsStale(const struct RepeatRingControl *xs, const u32 ringSize, + const u64a offset) { + u64a finalMatch = ringLastTop(xs, ringSize); + if (offset - finalMatch >= ringSize) { + DEBUG_PRINTF("all matches in ring are stale\n"); + return 1; + } + + return 0; +} + +void repeatStoreRing(const struct RepeatInfo *info, union RepeatControl *ctrl, + void *state, u64a offset, char is_alive) { + struct RepeatRingControl *xs = &ctrl->ring; + u8 *ring = (u8 *)state; + const u32 ringSize = ringCapacity(info); + assert(ringSize > 0); + + DEBUG_PRINTF("storing top for offset %llu in ring\n", offset); + + if (!is_alive || ringIsStale(xs, ringSize, offset)) { + storeInitialRingTop(xs, ring, offset, ringSize); + } else { + assert(offset > ringLastTop(xs, ringSize)); // Dupe or out of order. + u32 occ = ringOccupancy(xs, ringSize); + u64a diff = offset - xs->offset; + DEBUG_PRINTF("diff=%llu, occ=%u\n", diff, occ); + if (diff >= ringSize) { + u32 push = diff - ringSize + 1; + DEBUG_PRINTF("push ring %u\n", push); + xs->first += push; + if (xs->first >= ringSize) { + xs->first -= ringSize; + } + xs->offset += push; + diff -= push; + occ -= push; + } + + // There's now room in the ring for this top, so we write a run of + // zeroes, then a one. + DEBUG_PRINTF("diff=%llu, occ=%u\n", diff, occ); + assert(diff < ringSize); + assert(diff >= occ); + u32 n = diff - occ; + + u32 i = xs->last + n; + + mmbit_unset_range(ring, ringSize, xs->last, MIN(i, ringSize)); + if (i >= ringSize) { + i -= ringSize; + mmbit_unset_range(ring, ringSize, 0, i); + } + + assert(i != xs->first); + DEBUG_PRINTF("set bit %u\n", i); + mmbit_set(ring, ringSize, i); + xs->last = i + 1; + if (xs->last == ringSize) { + xs->last = 0; + } + } + + // Our ring indices shouldn't have spiraled off into uncharted space. + assert(xs->first < ringSize); + assert(xs->last < ringSize); + +#ifdef DEBUG + DEBUG_PRINTF("post-store ring state\n"); + dumpRing(info, xs, ring); +#endif + + // The final top stored in our ring should be the one we just wrote in. + assert(ringLastTop(xs, ringSize) == offset); +} + +static really_inline +void storeInitialRangeTop(struct RepeatRangeControl *xs, u16 *ring, + u64a offset) { + xs->offset = offset; + xs->num = 1; + unaligned_store_u16(ring, 0); +} + +void repeatStoreRange(const struct RepeatInfo *info, union RepeatControl *ctrl, + void *state, u64a offset, char is_alive) { + struct RepeatRangeControl *xs = &ctrl->range; + u16 *ring = (u16 *)state; + + if (!is_alive) { + DEBUG_PRINTF("storing initial top at %llu\n", offset); + storeInitialRangeTop(xs, ring, offset); + return; + } + + DEBUG_PRINTF("storing top at %llu, list currently has %u/%u elements\n", + offset, xs->num, rangeListCapacity(info)); + +#ifdef DEBUG + dumpRange(info, xs, ring); +#endif + + // Walk ring from front. Identify the number of stale elements, and shift + // the whole ring to delete them. 
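+    // (Example for a {2,4} repeat: with xs->offset == 100 and stored deltas
+    // {0, 3, 6}, a new top arriving at offset 108 expires the tops at 100 and
+    // 103 and rebases the list to xs->offset == 106.)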
+ u32 i = 0; + for (; i < xs->num; i++) { + u64a this_offset = xs->offset + unaligned_load_u16(ring + i); + DEBUG_PRINTF("this_offset=%llu, diff=%llu\n", this_offset, + offset - this_offset); + if (offset - this_offset <= info->repeatMax) { + break; + } + } + + if (i == xs->num) { + DEBUG_PRINTF("whole ring is stale\n"); + storeInitialRangeTop(xs, ring, offset); + return; + } else if (i > 0) { + DEBUG_PRINTF("expiring %u stale tops\n", i); + u16 first_offset = unaligned_load_u16(ring + i); // first live top + for (u32 j = 0; j < xs->num - i; j++) { + u16 val = unaligned_load_u16(ring + i + j); + assert(val >= first_offset); + unaligned_store_u16(ring + j, val - first_offset); + } + xs->offset += first_offset; + xs->num -= i; + } + +#ifdef DEBUG + DEBUG_PRINTF("post-expire:\n"); + dumpRange(info, xs, ring); +#endif + + if (xs->num == 1) { + goto append; + } + + // Let d = repeatMax - repeatMin + // Examine penultimate entry x[-2]. + // If (offset - x[-2] <= d), then last entry x[-1] can be replaced with + // entry for offset. + assert(xs->num >= 2); + u32 d = info->repeatMax - info->repeatMin; + u64a penultimate_offset = + xs->offset + unaligned_load_u16(ring + xs->num - 2); + if (offset - penultimate_offset <= d) { + assert(offset - xs->offset <= (u16)-1); + unaligned_store_u16(ring + xs->num - 1, offset - xs->offset); + goto done; + } + + // Otherwise, write a new entry for offset and return. + +append: + assert(offset - xs->offset <= (u16)-1); + assert(xs->num < rangeListCapacity(info)); + unaligned_store_u16(ring + xs->num, offset - xs->offset); + xs->num++; + +done: + assert(rangeListIsOrdered(xs, ring)); +} + +void repeatStoreBitmap(const struct RepeatInfo *info, union RepeatControl *ctrl, + u64a offset, char is_alive) { + DEBUG_PRINTF("{%u,%u} repeat, storing top at %llu\n", info->repeatMin, + info->repeatMax, offset); + + struct RepeatBitmapControl *xs = &ctrl->bitmap; + if (!is_alive || !xs->bitmap) { + DEBUG_PRINTF("storing initial top at %llu\n", offset); + xs->offset = offset; + xs->bitmap = 1U; + return; + } + +#ifdef DEBUG + DEBUG_PRINTF("pre-store:\n"); + dumpBitmap(xs); +#endif + + assert(offset >= xs->offset); + + u64a last_top = xs->offset + 63 - clz64(xs->bitmap); + if (offset > last_top + info->repeatMax) { + DEBUG_PRINTF("bitmap stale, storing initial top\n"); + xs->offset = offset; + xs->bitmap = 1U; + return; + } + + u64a diff = offset - xs->offset; + if (diff >= info->repeatMax + 1) { + DEBUG_PRINTF("need expire, diff=%llu\n", diff); + u64a push = diff - info->repeatMax; + xs->offset += push; + xs->bitmap = push >= 64 ? 0 : xs->bitmap >> push; + DEBUG_PRINTF("pushed xs->offset to %llu\n", xs->offset); + } + + // Write a new entry. + diff = offset - xs->offset; + assert(diff < 64); + xs->bitmap |= (1ULL << diff); + +#ifdef DEBUG + DEBUG_PRINTF("post-store:\n"); + dumpBitmap(xs); +#endif +} + +/** \brief Returns 1 if the ring has a match between (logical) index \a lower + * and \a upper, excluding \a upper. */ +static +int ringHasMatch(const struct RepeatRingControl *xs, const u8 *ring, + const u32 ringSize, u32 lower, u32 upper) { + assert(lower < upper); + assert(lower < ringSize); + assert(upper <= ringSize); + + u32 i = xs->first + lower; + if (i >= ringSize) { + i -= ringSize; + } + + // Performance tweak: if we're looking at a fixed repeat, we can just use + // mmbit_isset. 
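+    // (For a fixed {N,N} repeat the caller always computes upper == lower + 1,
+    // so the single bit test below covers that case.)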
+ if (lower + 1 == upper) { + return mmbit_isset(ring, ringSize, i); + } + + u32 end = xs->first + upper; + if (end >= ringSize) { + end -= ringSize; + } + + // First scan, either to end if there's no wrap-around or ringSize (end of + // the underlying multibit) if we are wrapping. + + u32 scan_end = i < end ? end : ringSize; + u32 m = mmbit_iterate_bounded(ring, ringSize, i, scan_end); + if (m != MMB_INVALID) { + return 1; + } + + // A second scan is necessary if we need to cope with wrap-around in the + // ring buffer. + + if (i >= end) { + m = mmbit_iterate_bounded(ring, ringSize, 0, end); + return m != MMB_INVALID; + } + + return 0; +} + +/** Return a mask of ones in bit positions [0..v]. */ +static really_inline +u64a mask_ones_to(u32 v) { + if (v < 63) { + return (1ULL << (v + 1)) - 1; + } else { + return ~(0ULL); + } +} + +void repeatStoreTrailer(const struct RepeatInfo *info, + union RepeatControl *ctrl, u64a offset, char is_alive) { + DEBUG_PRINTF("{%u,%u} repeat, top at %llu\n", info->repeatMin, + info->repeatMax, offset); + + struct RepeatTrailerControl *xs = &ctrl->trailer; + + /* The TRAILER repeat model stores the following data in its control block: + * + * 1. offset, which is the min extent of the most recent match window + * (i.e. corresponding to the most recent top) + * 2. bitmap, which is a bitmap of up to repeatMin matches before + * the min extent offset. + */ + + const u64a next_extent = offset + info->repeatMin; + + if (!is_alive) { + xs->offset = next_extent; + xs->bitmap = 0; + DEBUG_PRINTF("initial top, set extent to %llu\n", next_extent); + return; + } + +#ifdef DEBUG + DEBUG_PRINTF("pre-store:\n"); + dumpTrailer(info, xs); +#endif + + const u32 m_width = info->repeatMax - info->repeatMin; + DEBUG_PRINTF("most recent match window is [%llu,%llu]\n", xs->offset, + xs->offset + m_width); + + assert(next_extent > xs->offset); + u64a diff = next_extent - xs->offset; + DEBUG_PRINTF("diff=%llu, m_width=%u\n", diff, m_width); + + assert(diff); + xs->bitmap = diff < 64 ? xs->bitmap << diff : 0; + + // Switch on bits in the bitmask corresponding to matches in the previous + // match window. + if (diff <= m_width) { + u64a m = mask_ones_to(diff - 1); + xs->bitmap |= m; + } else { + u64a shift = diff - m_width - 1; + if (shift < 64) { + u64a m = mask_ones_to(m_width); + m <<= shift; + xs->bitmap |= m; + } + } + + DEBUG_PRINTF("bitmap=0x%llx\n", xs->bitmap); + + // Update max extent. + xs->offset = next_extent; + + // Trim stale history: we only need repeatMin bytes of history. 
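+    // (For example, with repeatMin == 3 the mask (1ULL << 4) - 1 keeps only
+    // bits 0..3 of the bitmap.)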
+ if (info->repeatMin < 63) { + u64a mask = (1ULL << (info->repeatMin + 1)) - 1; + xs->bitmap &= mask; + } + +#ifdef DEBUG + DEBUG_PRINTF("post-store:\n"); + dumpTrailer(info, xs); +#endif +} + +enum RepeatMatch repeatHasMatchRing(const struct RepeatInfo *info, + const union RepeatControl *ctrl, + const void *state, u64a offset) { + const struct RepeatRingControl *xs = &ctrl->ring; + const u8 *ring = (const u8 *)state; + const u32 ringSize = ringCapacity(info); + + assert(mmbit_any(ring, ringSize)); + assert(offset >= xs->offset); + + DEBUG_PRINTF("check: offset=%llu, repeat=[%u,%u]\n", offset, + info->repeatMin, info->repeatMax); +#ifdef DEBUG + DEBUG_PRINTF("ring state\n"); + dumpRing(info, xs, ring); +#endif + + if (offset - xs->offset < info->repeatMin) { + DEBUG_PRINTF("haven't even seen repeatMin bytes yet!\n"); + return REPEAT_NOMATCH; + } + + if (offset - ringLastTop(xs, ringSize) >= ringSize) { + DEBUG_PRINTF("ring is stale\n"); + return REPEAT_STALE; + } + + // If we're not stale, delta fits in the range [repeatMin, lastTop + + // repeatMax], which fits in a u32. + assert(offset - xs->offset < UINT32_MAX); + u32 delta = (u32)(offset - xs->offset); + DEBUG_PRINTF("delta=%u\n", delta); + + // Find the bounds on possible matches in the ring buffer. + u32 lower = delta > info->repeatMax ? delta - info->repeatMax : 0; + u32 upper = MIN(delta - info->repeatMin + 1, ringOccupancy(xs, ringSize)); + + if (lower >= upper) { + DEBUG_PRINTF("no matches to check\n"); + return REPEAT_NOMATCH; + } + + DEBUG_PRINTF("possible match indices=[%u,%u]\n", lower, upper); + if (ringHasMatch(xs, ring, ringSize, lower, upper)) { + return REPEAT_MATCH; + } + + return REPEAT_NOMATCH; +} + +enum RepeatMatch repeatHasMatchRange(const struct RepeatInfo *info, + const union RepeatControl *ctrl, + const void *state, u64a offset) { + const struct RepeatRangeControl *xs = &ctrl->range; + const u16 *ring = (const u16 *)state; + + assert(xs->num > 0); + assert(xs->num <= rangeListCapacity(info)); + assert(rangeListIsOrdered(xs, ring)); + + // Walk the ring. For each entry x: + // if (offset - x) falls inside repeat bounds, return success. + + // It may be worth doing tests on first and last elements first to bail + // early if the whole ring is too young or stale. + + DEBUG_PRINTF("check %u (of %u) elements, offset %llu, bounds={%u,%u}\n", + xs->num, rangeListCapacity(info), offset, + info->repeatMin, info->repeatMax); +#ifdef DEBUG + dumpRange(info, xs, ring); +#endif + + // Quick pre-check for minimum. + assert(offset >= xs->offset); + if (offset - xs->offset < info->repeatMin) { + DEBUG_PRINTF("haven't even seen repeatMin bytes yet!\n"); + return REPEAT_NOMATCH; + } + + // We check the most recent offset first, as we can establish staleness. + u64a match = xs->offset + unaligned_load_u16(ring + xs->num - 1); + assert(offset >= match); + u64a diff = offset - match; + if (diff > info->repeatMax) { + DEBUG_PRINTF("range list is stale\n"); + return REPEAT_STALE; + } else if (diff >= info->repeatMin && diff <= info->repeatMax) { + return REPEAT_MATCH; + } + + // Check the other offsets in the list. 
+ u32 count = xs->num - 1; + for (u32 i = 0; i < count; i++) { + match = xs->offset + unaligned_load_u16(ring + i); + assert(offset >= match); + diff = offset - match; + if (diff >= info->repeatMin && diff <= info->repeatMax) { + return REPEAT_MATCH; + } + } + + return REPEAT_NOMATCH; +} + +enum RepeatMatch repeatHasMatchBitmap(const struct RepeatInfo *info, + const union RepeatControl *ctrl, + u64a offset) { + const struct RepeatBitmapControl *xs = &ctrl->bitmap; + + DEBUG_PRINTF("checking if offset=%llu is a match\n", offset); + +#ifdef DEBUG + dumpBitmap(xs); +#endif + + u64a bitmap = xs->bitmap; + if (!bitmap) { + DEBUG_PRINTF("no tops; stale\n"); + return REPEAT_STALE; + } + + // Quick pre-check for minimum. + const u64a base = xs->offset; + assert(offset >= base); + if (offset - base < info->repeatMin) { + DEBUG_PRINTF("haven't even seen repeatMin bytes yet!\n"); + return REPEAT_NOMATCH; + } + + // We check the most recent offset first, as we can establish staleness. + u64a match = base + findAndClearMSB_64(&bitmap); + DEBUG_PRINTF("offset=%llu, last_match %llu\n", offset, match); + assert(offset >= match); + u64a diff = offset - match; + if (diff > info->repeatMax) { + DEBUG_PRINTF("stale\n"); + return REPEAT_STALE; + } else if (diff >= info->repeatMin && diff <= info->repeatMax) { + return REPEAT_MATCH; + } + + while (bitmap) { + match = base + findAndClearLSB_64(&bitmap); + DEBUG_PRINTF("offset=%llu, last_match %llu\n", offset, match); + assert(offset >= match); + diff = offset - match; + if (diff >= info->repeatMin && diff <= info->repeatMax) { + return REPEAT_MATCH; + } + } + + return REPEAT_NOMATCH; +} + +enum RepeatMatch repeatHasMatchTrailer(const struct RepeatInfo *info, + const union RepeatControl *ctrl, + u64a offset) { + const struct RepeatTrailerControl *xs = &ctrl->trailer; + const u32 m_width = info->repeatMax - info->repeatMin; + + DEBUG_PRINTF("offset=%llu, xs->offset=%llu, xs->bitmap=0x%llx\n", offset, + xs->offset, xs->bitmap); + + if (offset > xs->offset + m_width) { + DEBUG_PRINTF("stale\n"); + return REPEAT_STALE; + } + + if (offset >= xs->offset) { + DEBUG_PRINTF("in match window\n"); + return REPEAT_MATCH; + } + + if (offset >= xs->offset - info->repeatMin) { + u32 idx = xs->offset - offset - 1; + DEBUG_PRINTF("check bitmap idx %u\n", idx); + assert(idx < 64); + if (xs->bitmap & (1ULL << idx)) { + DEBUG_PRINTF("match in bitmap\n"); + return REPEAT_MATCH; + } + } + + DEBUG_PRINTF("no match\n"); + return REPEAT_NOMATCH; +} + +/** \brief True if the given value can be packed into len bytes. */ +static really_inline +int fits_in_len_bytes(u64a val, u32 len) { + if (len >= 8) { + return 1; + } + return val <= (1ULL << (len * 8)); +} + +static really_inline +void storePackedRelative(char *dest, u64a val, u64a offset, u64a max, u32 len) { + assert(val <= offset); + assert(fits_in_len_bytes(max, len)); + u64a delta = offset - val; + if (delta >= max) { + delta = max; + } + DEBUG_PRINTF("delta %llu\n", delta); + assert(fits_in_len_bytes(delta, len)); + partial_store_u64a(dest, delta, len); +} + +static +void repeatPackRing(char *dest, const struct RepeatInfo *info, + const union RepeatControl *ctrl, u64a offset) { + const struct RepeatRingControl *xs = &ctrl->ring; + const u32 ring_indices_len = info->repeatMax < 254 ? 2 : 4; + const u32 offset_len = info->packedCtrlSize - ring_indices_len; + + // Write out packed relative base offset. 
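+    // (Packed layout: offset_len bytes of relative base offset, followed by
+    // the two ring indices at one byte each when repeatMax < 254 and two
+    // bytes each otherwise.)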
+ assert(info->packedCtrlSize > ring_indices_len); + storePackedRelative(dest, xs->offset, offset, info->horizon, offset_len); + + // Write out ring indices. + if (ring_indices_len == 4) { + unaligned_store_u16(dest + offset_len, xs->first); + unaligned_store_u16(dest + offset_len + 2, xs->last); + } else { + assert(xs->first < 256 && xs->last < 256); + u8 *indices = (u8 *)dest + offset_len; + indices[0] = xs->first; + indices[1] = xs->last; + } +} + +static +void repeatPackOffset(char *dest, const struct RepeatInfo *info, + const union RepeatControl *ctrl, u64a offset) { + const struct RepeatOffsetControl *xs = &ctrl->offset; + DEBUG_PRINTF("packing offset %llu [h %u]\n", xs->offset, info->horizon); + if (!info->packedCtrlSize) { + assert(info->type == REPEAT_ALWAYS); + DEBUG_PRINTF("externally guarded .*\n"); + return; + } + storePackedRelative(dest, xs->offset, offset, info->horizon, + info->packedCtrlSize); +} + +static +void repeatPackRange(char *dest, const struct RepeatInfo *info, + const union RepeatControl *ctrl, u64a offset) { + const struct RepeatRangeControl *xs = &ctrl->range; + + // Write out packed relative base offset. + assert(info->packedCtrlSize > 1); + storePackedRelative(dest, xs->offset, offset, info->horizon, + info->packedCtrlSize - 1); + + // Write out range number of elements. + dest[info->packedCtrlSize - 1] = xs->num; +} + +static +void repeatPackBitmap(char *dest, const struct RepeatInfo *info, + const union RepeatControl *ctrl, u64a offset) { + const struct RepeatBitmapControl *xs = &ctrl->bitmap; + const u32 bound = info->repeatMax; + + assert(offset >= xs->offset); + u64a new_base = offset > bound ? offset - bound : 0; + + // Shift bitmap to begin at new_base rather than xs->offset. + u64a bitmap = xs->bitmap; + if (new_base >= xs->offset) { + u64a shift = new_base - xs->offset; + bitmap = shift < 64 ? bitmap >> shift : 0; + } else { + u64a shift = xs->offset - new_base; + bitmap = shift < 64 ? bitmap << shift : 0; + } + + DEBUG_PRINTF("packing %llu into %u bytes\n", bitmap, info->packedCtrlSize); + + // Write out packed bitmap. + assert(fits_in_len_bytes(bitmap, info->packedCtrlSize)); + partial_store_u64a(dest, bitmap, info->packedCtrlSize); +} + +static +void repeatPackSparseOptimalP(char *dest, const struct RepeatInfo *info, + const union RepeatControl *ctrl, u64a offset) { + const struct RepeatRingControl *xs = &ctrl->ring; + // set ring index pointer according to patch count + const u32 ring_indices_len = info->patchCount < 254 ? 2 : 4; + const u32 offset_len = info->packedCtrlSize - ring_indices_len; + + // Write out packed relative base offset. + assert(info->packedCtrlSize > ring_indices_len); + storePackedRelative(dest, xs->offset, offset, info->horizon, offset_len); + + // Write out ring indices. + if (ring_indices_len == 4) { + unaligned_store_u16(dest + offset_len, xs->first); + unaligned_store_u16(dest + offset_len + 2, xs->last); + } else { + assert(xs->first < 256 && xs->last < 256); + u8 *indices = (u8 *)dest + offset_len; + indices[0] = xs->first; + indices[1] = xs->last; + } + +} + +static +void repeatPackTrailer(char *dest, const struct RepeatInfo *info, + const union RepeatControl *ctrl, u64a offset) { + const struct RepeatTrailerControl *xs = &ctrl->trailer; + + DEBUG_PRINTF("saving: offset=%llu, xs->offset=%llu, xs->bitmap=0x%llx\n", + offset, xs->offset, xs->bitmap); + + // XXX: xs->offset may be zero in the NFA path (effectively uninitialized). 
+ u64a top; + if (xs->offset) { + assert(xs->offset >= info->repeatMin); + top = xs->offset - info->repeatMin; + } else { + top = 0; + } + + top = offset - top; // Pack top relative to offset. + + u64a v[2]; + v[0] = MIN(top, info->horizon); + v[1] = xs->bitmap; + + pack_bits_64(dest, v, info->packedFieldSizes, 2); +} + +void repeatPack(char *dest, const struct RepeatInfo *info, + const union RepeatControl *ctrl, u64a offset) { + assert(dest && info && ctrl); + + switch ((enum RepeatType)info->type) { + case REPEAT_RING: + repeatPackRing(dest, info, ctrl, offset); + break; + case REPEAT_FIRST: + case REPEAT_LAST: + repeatPackOffset(dest, info, ctrl, offset); + break; + case REPEAT_RANGE: + repeatPackRange(dest, info, ctrl, offset); + break; + case REPEAT_BITMAP: + repeatPackBitmap(dest, info, ctrl, offset); + break; + case REPEAT_SPARSE_OPTIMAL_P: + repeatPackSparseOptimalP(dest, info, ctrl, offset); + break; + case REPEAT_TRAILER: + repeatPackTrailer(dest, info, ctrl, offset); + break; + case REPEAT_ALWAYS: + /* nothing to do - no state */ + break; + } +} + +static really_inline +u64a loadPackedRelative(const char *src, u64a offset, u32 len) { + u64a delta = partial_load_u64a(src, len); + DEBUG_PRINTF("delta %llu\n", delta); + assert(offset >= delta); + return offset - delta; +} + +static +void repeatUnpackRing(const char *src, const struct RepeatInfo *info, + u64a offset, union RepeatControl *ctrl) { + struct RepeatRingControl *xs = &ctrl->ring; + const u32 ring_indices_len = info->repeatMax < 254 ? 2 : 4; + const u32 offset_len = info->packedCtrlSize - ring_indices_len; + xs->offset = loadPackedRelative(src, offset, offset_len); + if (ring_indices_len == 4) { + xs->first = unaligned_load_u16(src + offset_len); + xs->last = unaligned_load_u16(src + offset_len + 2); + } else { + const u8 *indices = (const u8 *)src + offset_len; + xs->first = indices[0]; + xs->last = indices[1]; + } +} + +static +void repeatUnpackOffset(const char *src, const struct RepeatInfo *info, + u64a offset, union RepeatControl *ctrl) { + struct RepeatOffsetControl *xs = &ctrl->offset; + if (!info->packedCtrlSize) { + assert(info->type == REPEAT_ALWAYS); + DEBUG_PRINTF("externally guarded .*\n"); + xs->offset = 0; + } else { + xs->offset = loadPackedRelative(src, offset, info->packedCtrlSize); + } + DEBUG_PRINTF("unpacking offset %llu [h%u]\n", xs->offset, + info->horizon); +} + +static +void repeatUnpackRange(const char *src, const struct RepeatInfo *info, + u64a offset, union RepeatControl *ctrl) { + struct RepeatRangeControl *xs = &ctrl->range; + xs->offset = loadPackedRelative(src, offset, info->packedCtrlSize - 1); + xs->num = src[info->packedCtrlSize - 1]; +} + +static +void repeatUnpackBitmap(const char *src, const struct RepeatInfo *info, + u64a offset, union RepeatControl *ctrl) { + struct RepeatBitmapControl *xs = &ctrl->bitmap; + xs->offset = offset > info->repeatMax ? offset - info->repeatMax : 0; + xs->bitmap = partial_load_u64a(src, info->packedCtrlSize); +} + +static +void repeatUnpackSparseOptimalP(const char *src, const struct RepeatInfo *info, + u64a offset, union RepeatControl *ctrl) { + struct RepeatRingControl *xs = &ctrl->ring; + const u32 ring_indices_len = info->patchCount < 254 ? 
2 : 4; + const u32 offset_len = info->packedCtrlSize - ring_indices_len; + xs->offset = loadPackedRelative(src, offset, offset_len); + if (ring_indices_len == 4) { + xs->first = unaligned_load_u16(src + offset_len); + xs->last = unaligned_load_u16(src + offset_len + 2); + } else { + const u8 *indices = (const u8 *)src + offset_len; + xs->first = indices[0]; + xs->last = indices[1]; + } +} + +static +void repeatUnpackTrailer(const char *src, const struct RepeatInfo *info, + u64a offset, union RepeatControl *ctrl) { + struct RepeatTrailerControl *xs = &ctrl->trailer; + + u64a v[2]; + unpack_bits_64(v, (const u8 *)src, info->packedFieldSizes, 2); + + xs->offset = offset - v[0] + info->repeatMin; + xs->bitmap = v[1]; + + DEBUG_PRINTF("loaded: xs->offset=%llu, xs->bitmap=0x%llx\n", xs->offset, + xs->bitmap); +} + +void repeatUnpack(const char *src, const struct RepeatInfo *info, u64a offset, + union RepeatControl *ctrl) { + assert(src && info && ctrl); + + switch ((enum RepeatType)info->type) { + case REPEAT_RING: + repeatUnpackRing(src, info, offset, ctrl); + break; + case REPEAT_FIRST: + case REPEAT_LAST: + repeatUnpackOffset(src, info, offset, ctrl); + break; + case REPEAT_RANGE: + repeatUnpackRange(src, info, offset, ctrl); + break; + case REPEAT_BITMAP: + repeatUnpackBitmap(src, info, offset, ctrl); + break; + case REPEAT_SPARSE_OPTIMAL_P: + repeatUnpackSparseOptimalP(src, info, offset, ctrl); + break; + case REPEAT_TRAILER: + repeatUnpackTrailer(src, info, offset, ctrl); + break; + case REPEAT_ALWAYS: + /* nothing to do - no state */ + break; + } +} + +static really_inline +const u64a *getImplTable(const struct RepeatInfo *info) { + const u64a *table = ((const u64a *)(ROUNDUP_PTR( + ((const char *)(info) + + sizeof(*info)), + alignof(u64a)))); + return table; +} + +static +void storeInitialRingTopPatch(const struct RepeatInfo *info, + struct RepeatRingControl *xs, + u8 *state, u64a offset) { + DEBUG_PRINTF("set the first patch, offset=%llu\n", offset); + xs->offset = offset; + + u8 *active = state; + u32 patch_count = info->patchCount; + mmbit_clear(active, patch_count); + mmbit_set(active, patch_count, 0); + + u8 *ring = active + info->patchesOffset; + u32 encoding_size = info->encodingSize; + partial_store_u64a(ring, 1ull, encoding_size); + xs->first = 0; + xs->last = 1; +} + +static +u32 getSparseOptimalTargetValue(const struct RepeatInfo *info, + const u32 tval, u64a *val) { + u32 patch_size = info->patchSize; + const u64a *repeatTable = getImplTable(info); + u32 loc = 0; + DEBUG_PRINTF("val:%llu \n", *val); + for (u32 i = 1; i <= patch_size - tval; i++) { + u64a tmp = repeatTable[patch_size - i]; + if (*val >= tmp) { + *val -= tmp; + loc = i; + i += (info->minPeriod - 1); + } + } + + return loc; +} + +static +u64a sparseLastTop(const struct RepeatInfo *info, + const struct RepeatRingControl *xs, const u8 *state) { + DEBUG_PRINTF("looking for last top\n"); + u32 patch_size = info->patchSize; + u32 patch_count = info->patchCount; + u32 encoding_size = info->encodingSize; + + u32 occ = ringOccupancy(xs, patch_count); + u32 patch = xs->first + occ - 1; + if (patch >= patch_count) { + patch -= patch_count; + } + + DEBUG_PRINTF("patch%u encoding_size%u occ%u\n", patch, encoding_size, occ); + const u8 *ring = state + info->patchesOffset; + u64a val = partial_load_u64a(ring + encoding_size * patch, encoding_size); + + DEBUG_PRINTF("val:%llu\n", val); + const u64a *repeatTable = getImplTable(info); + for (s32 i = patch_size - 1; i >= 0; i--) { + if (val >= repeatTable[i]) { + 
DEBUG_PRINTF("xs->offset%llu v%u p%llu\n", + xs->offset, i, repeatTable[i]); + return xs->offset + i + (occ - 1) * patch_size; + } + } + + assert(0); + return 0; +} + +u64a repeatLastTopSparseOptimalP(const struct RepeatInfo *info, + const union RepeatControl *ctrl, + const void *state) { + return sparseLastTop(info, &ctrl->ring, state); +} + +u64a repeatNextMatchSparseOptimalP(const struct RepeatInfo *info, + const union RepeatControl *ctrl, + const void *state, u64a offset) { + const struct RepeatRingControl *xs = &ctrl->ring; + + DEBUG_PRINTF("repeat [%u, %u] looking for match after %llu\n", + info->repeatMin, info->repeatMax, offset); + + assert(offset >= xs->offset); + + u64a nextOffset = offset + 1; + + u32 patch_size = info->patchSize; + u32 patch; + u32 tval; + if (nextOffset <= xs->offset + info->repeatMin) { + patch = xs->first; + tval = 0; + } else if (nextOffset > sparseLastTop(info, xs, state) + info->repeatMax) { + DEBUG_PRINTF("ring is stale\n"); + return 0; + } else { + assert(nextOffset - xs->offset < UINT32_MAX); // ring is not stale + u32 delta = (u32)(nextOffset - xs->offset); + u32 lower = delta > info->repeatMax ? delta - info->repeatMax : 0; + patch = lower / patch_size; + tval = lower - patch * patch_size; + } + + DEBUG_PRINTF("patch %u\n", patch); + u32 patch_count = info->patchCount; + if (patch >= patch_count) { + return 0; + } + + DEBUG_PRINTF("initial test for %u\n", tval); + + u32 begin = xs->first + patch; + if (begin >= patch_count) { + begin -= patch_count; + } + + const u8 *active = (const u8 *)state; + const u8 *ring = active + info->patchesOffset; + u32 encoding_size = info->encodingSize; + const u32 end = begin >= xs->last ? patch_count : xs->last; + u32 low = tval; + u64a diff = 0, loc = 0; + DEBUG_PRINTF("begin %u end %u\n", begin, end); + for (u32 p = mmbit_iterate_bounded(active, patch_count, begin, end); + p != MMB_INVALID; p = mmbit_iterate_bounded(active, patch_count, + p + 1, end)) { + if (p != begin) { + low = 0; + } + + u64a val = partial_load_u64a(ring + encoding_size * p, encoding_size); + u32 p1 = 0; + if (p >= xs->first) { + p1 = p - xs->first; + } else { + p1 = p + patch_count - xs->first; + } + + if (val) { + loc = getSparseOptimalTargetValue(info, low, &val); + diff = (p1 + 1) * patch_size - loc; + } + if (loc) { + u64a rv = MAX(nextOffset, xs->offset + info->repeatMin + diff); + DEBUG_PRINTF("offset%llu next match at %llu\n", xs->offset, rv); + return rv; + } + low = 0; + } + + low = 0; + if (begin >= xs->last) { + for (u32 p = mmbit_iterate_bounded(active, patch_count, 0, xs->last); + p != MMB_INVALID; p = mmbit_iterate_bounded(active, patch_count, + p + 1, xs->last)) { + + u64a val = partial_load_u64a(ring + encoding_size * p, + encoding_size); + if (val) { + loc = getSparseOptimalTargetValue(info, low, &val); + diff = (p + 1) * patch_size - loc; + } + if (loc) { + u64a rv = MAX(nextOffset, xs->offset + info->repeatMin + + diff + (end - xs->first) * patch_size); + DEBUG_PRINTF("next match at %llu\n", rv); + return rv; + } + } + } + + DEBUG_PRINTF("next match\n"); + return 0; +} + +void repeatStoreSparseOptimalP(const struct RepeatInfo *info, + union RepeatControl *ctrl, void *state, + u64a offset, char is_alive) { + struct RepeatRingControl *xs = &ctrl->ring; + u8 *active = (u8 *)state; + + DEBUG_PRINTF("offset: %llu encoding_size: %u\n", offset, + info->encodingSize); + + // If (a) this is the first top, or (b) the ring is stale, initialize the + // ring and write this offset in as the first top. 
+ if (!is_alive || + offset > sparseLastTop(info, xs, state) + info->repeatMax) { + storeInitialRingTopPatch(info, xs, active, offset); + return; + } + + // Tops should arrive in order, with no duplicates. + assert(offset > sparseLastTop(info, xs, state)); + + // As the ring is not stale, our delta should fit within a u32. + assert(offset - xs->offset <= UINT32_MAX); + u32 delta = (u32)(offset - xs->offset); + u32 patch_size = info->patchSize; + u32 patch_count = info->patchCount; + u32 encoding_size = info->encodingSize; + u32 patch = delta / patch_size; + + DEBUG_PRINTF("delta=%u, patch_size=%u, patch=%u\n", delta, patch_size, + patch); + + u8 *ring = active + info->patchesOffset; + u32 occ = ringOccupancy(xs, patch_count); + u64a val = 0; + u32 idx; + + DEBUG_PRINTF("patch: %u patch_count: %u occ: %u\n", + patch, patch_count, occ); + if (patch >= patch_count) { + u32 patch_shift_count = patch - patch_count + 1; + assert(patch >= patch_shift_count); + DEBUG_PRINTF("shifting by %u\n", patch_shift_count); + xs->offset += patch_size * patch_shift_count; + xs->first += patch_shift_count; + if (xs->first >= patch_count) { + xs->first -= patch_count; + } + idx = xs->last + patch - occ; + mmbit_unset_range(active, patch_count, xs->last, + MIN(idx, patch_count)); + if (idx >= patch_count) { + idx -= patch_count; + mmbit_unset_range(active, patch_count, 0, idx + 1); + } + xs->last = idx + 1; + if (xs->last == patch_count) { + xs->last = 0; + } + } else if (patch < occ) { + assert(patch == occ - 1); + idx = xs->last == 0 ? patch_count - 1 : (u32)xs->last - 1; + val = partial_load_u64a(ring + encoding_size * idx, encoding_size); + } else { + idx = xs->last + patch - occ; + mmbit_unset_range(active, patch_count, xs->last, + MIN(idx, patch_count)); + if (idx >= patch_count) { + idx -= patch_count; + mmbit_unset_range(active, patch_count, 0, idx + 1); + } + xs->last = idx + 1; + if (xs->last == patch_count) { + xs->last = 0; + } + } + + assert((u64a)patch * patch_size <= delta); + u32 diff = delta - patch * patch_size; + const u64a *repeatTable = getImplTable(info); + val += repeatTable[diff]; + + DEBUG_PRINTF("patch=%u, occ=%u\n", patch, occ); + DEBUG_PRINTF("xs->first:%u xs->last:%u patch:%u\n", + xs->first, xs->last, patch); + DEBUG_PRINTF("value:%llu\n", val); + assert(fits_in_len_bytes(val, encoding_size)); + partial_store_u64a(ring + encoding_size * idx, val, encoding_size); + mmbit_set(active, patch_count, idx); +} + +static +char sparseHasMatch(const struct RepeatInfo *info, const u8 *state, + u32 lower, u32 upper) { + u32 patch_size = info->patchSize; + u32 patch_count = info->patchCount; + u32 encoding_size = info->encodingSize; + u32 patch_lower = lower / patch_size; + u32 patch_upper = upper / patch_size; + u32 diff = lower - patch_lower * patch_size; + + DEBUG_PRINTF("lower=%u, upper=%u\n", lower, upper); + const u64a *repeatTable = getImplTable(info); + + const u8 *ring = state + info->patchesOffset; + const u8 *active = state; + u64a val; + // test the first patch + if (mmbit_isset(active, patch_count, patch_lower)) { + val = partial_load_u64a(ring + encoding_size * patch_lower, + encoding_size); + DEBUG_PRINTF("patch_size=%u, diff=%u, table=%llu\n", + patch_size, diff, repeatTable[diff]); + DEBUG_PRINTF("patch_lower=%u, patch_upper=%u\n", + patch_lower, patch_upper); + if (patch_upper == patch_lower) { + u32 limit = upper - patch_lower * patch_size; + getSparseOptimalTargetValue(info, limit + 1, &val); + } + if (val >= repeatTable[diff]) { + return 1; + } + } + + if (patch_lower == 
patch_upper) { + return 0; + } + + // test the patches between first and last + u32 m = mmbit_iterate_bounded(active, patch_count, + patch_lower + 1, patch_upper); + if (m != MMB_INVALID) { + return 1; + } + + if (patch_upper == patch_count) { + return 0; + } + + // test the last patch + if (!mmbit_isset(active, patch_count, patch_upper)) { + return 0; + } + diff = (patch_upper + 1) * patch_size - upper; + DEBUG_PRINTF("diff=%u\n", diff); + val = partial_load_u64a(ring + encoding_size * patch_upper, encoding_size); + getSparseOptimalTargetValue(info, patch_size - diff + 1, &val); + if (val) { + DEBUG_PRINTF("last patch: val=%llu\n", val); + return 1; + } + + return 0; +} + +enum RepeatMatch repeatHasMatchSparseOptimalP(const struct RepeatInfo *info, + const union RepeatControl *ctrl, + const void *state, u64a offset) { + DEBUG_PRINTF("check for match at %llu corresponding to trigger " + "at [%llu, %llu]\n", offset, offset - info->repeatMax, + offset - info->repeatMin); + + const struct RepeatRingControl *xs = &ctrl->ring; + const u8 *ring = (const u8 *)state; + + assert(offset >= xs->offset); + + if (offset < xs->offset + info->repeatMin) { + DEBUG_PRINTF("too soon\n"); + return REPEAT_NOMATCH; + } else if (offset > sparseLastTop(info, xs, state) + info->repeatMax) { + DEBUG_PRINTF("stale\n"); + return REPEAT_STALE; + } + + // Our delta between the base offset of the ring and the current offset + // must fit within the range [repeatMin, lastPossibleTop + repeatMax]. This + // range fits comfortably within a u32. + assert(offset - xs->offset <= UINT32_MAX); + + u32 delta = (u32)(offset - xs->offset); + u32 patch_size = info->patchSize; + u32 patch_count = info->patchCount; + u32 occ = ringOccupancy(xs, patch_count); + + u32 lower = delta > info->repeatMax ? 
delta - info->repeatMax : 0; + u32 upper = MIN(delta - info->repeatMin, occ * patch_size - 1); + + DEBUG_PRINTF("lower=%u, upper=%u\n", lower, upper); + u32 patch_lower = lower / patch_size; + u32 patch_upper = upper / patch_size; + + if (patch_lower >= occ) { + DEBUG_PRINTF("too late\n"); + return REPEAT_NOMATCH; + } + + u32 remaining_lower = lower - patch_lower * patch_size; + u32 remaining_upper = upper - patch_upper * patch_size; + patch_lower += xs->first; + patch_upper += xs->first; + if (patch_lower >= patch_count) { + patch_lower -= patch_count; + patch_upper -= patch_count; + } else if (patch_upper >= patch_count) { + patch_upper -= patch_count; + } + + DEBUG_PRINTF("xs->first:%u xs->last:%u patch_lower:%u, patch_upper:%u\n", + xs->first, xs->last, patch_lower, patch_upper); + + u32 scan_end; + const char is_not_wrapped = (patch_lower <= patch_upper); + if (is_not_wrapped) { + scan_end = patch_upper * patch_size + remaining_upper; + } else { + scan_end = patch_count * patch_size; + } + + lower = patch_lower * patch_size + remaining_lower; + if (sparseHasMatch(info, ring, lower, scan_end)) { + return REPEAT_MATCH; + } + + if (!is_not_wrapped) { + upper -= (patch_count - xs->first) * patch_size; + if (sparseHasMatch(info, ring, 0, upper)) { + return REPEAT_MATCH; + } + } + + return REPEAT_NOMATCH; +} diff --git a/regex/nfa/repeat.h b/regex/nfa/repeat.h new file mode 100644 index 000000000..d4f84ea0a --- /dev/null +++ b/regex/nfa/repeat.h @@ -0,0 +1,370 @@ +/* + * Copyright (c) 2015, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief API for handling bounded repeats. + * + * This file provides an internal API for handling bounded repeats of character + * classes. It is used by the Large Bounded Repeat (LBR) engine and by the + * bounded repeat handling in the LimEx NFA engine as well. + * + * The state required by these functions is split into two regions: + * + * 1. Control block. This is a small structure (size varies with repeat mode) + * that may be copied around or compressed into stream state. 
+ * 2. Repeat state. This is a larger structure that can be quite big for large + * repeats, often containing a multibit ring or large vector of indices. + * This generally lives in stream state and is not copied. + */ + +#ifndef REPEAT_H +#define REPEAT_H + +#include "ue2common.h" +#include "repeat_internal.h" +#include "util/bitutils.h" + +#ifdef __cplusplus +extern "C" +{ +#endif + +/** Returns the offset of the most recent 'top' offset set in the repeat. */ +static really_inline +u64a repeatLastTop(const struct RepeatInfo *info, + const union RepeatControl *ctrl, const void *state); + +/** Returns the offset of the next match after 'offset', or zero if no further + * matches are possible. */ +static really_inline +u64a repeatNextMatch(const struct RepeatInfo *info, + const union RepeatControl *ctrl, const void *state, + u64a offset); + +/** Stores a new top in the repeat. If is_alive is false, the repeat will be + * initialised first and this top will become the first (and only) one. */ +static really_inline +void repeatStore(const struct RepeatInfo *info, union RepeatControl *ctrl, + void *state, u64a offset, char is_alive); + +/** Return type for repeatHasMatch. */ +enum RepeatMatch { + REPEAT_NOMATCH, /**< This offset is not a valid match. */ + REPEAT_MATCH, /**< This offset is a valid match. */ + REPEAT_STALE /**< This offset is not a valid match and no greater + offset will be (unless another top is stored). */ +}; + +/** Query whether the repeat has a match at the given offset. Returns + * ::REPEAT_STALE if it does not have a match at that offset _and_ + * no further matches are possible. */ +static really_inline +enum RepeatMatch repeatHasMatch(const struct RepeatInfo *info, + const union RepeatControl *ctrl, + const void *state, u64a offset); + +/** \brief Serialize a packed version of the repeat control block into stream + * state. */ +void repeatPack(char *dest, const struct RepeatInfo *info, + const union RepeatControl *ctrl, u64a offset); + +/** \brief Deserialize a packed version of the repeat control block. */ +void repeatUnpack(const char *src, const struct RepeatInfo *info, u64a offset, + union RepeatControl *ctrl); + +//// +//// IMPLEMENTATION. +//// + +u64a repeatLastTopRing(const struct RepeatInfo *info, + const union RepeatControl *ctrl); + +u64a repeatLastTopRange(const union RepeatControl *ctrl, + const void *state); + +u64a repeatLastTopBitmap(const union RepeatControl *ctrl); + +u64a repeatLastTopTrailer(const struct RepeatInfo *info, + const union RepeatControl *ctrl); + +u64a repeatLastTopSparseOptimalP(const struct RepeatInfo *info, + const union RepeatControl *ctrl, + const void *state); + +static really_inline +u64a repeatLastTop(const struct RepeatInfo *info, + const union RepeatControl *ctrl, const void *state) { + assert(info && ctrl && state); + + switch ((enum RepeatType)info->type) { + case REPEAT_RING: + return repeatLastTopRing(info, ctrl); + case REPEAT_FIRST: + case REPEAT_LAST: + return ctrl->offset.offset; + case REPEAT_RANGE: + return repeatLastTopRange(ctrl, state); + case REPEAT_BITMAP: + return repeatLastTopBitmap(ctrl); + case REPEAT_SPARSE_OPTIMAL_P: + return repeatLastTopSparseOptimalP(info, ctrl, state); + case REPEAT_TRAILER: + return repeatLastTopTrailer(info, ctrl); + case REPEAT_ALWAYS: + return 0; + } + + DEBUG_PRINTF("bad repeat type %u\n", info->type); + assert(0); + return 0; +} + +// Used for both FIRST and LAST models. 
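+// The earliest possible match is repeatMin past the stored top; every offset
+// after that matches until the top plus repeatMax is exceeded (REPEAT_INF never is).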
+static really_inline +u64a repeatNextMatchOffset(const struct RepeatInfo *info, + const union RepeatControl *ctrl, u64a offset) { + u64a first = ctrl->offset.offset + info->repeatMin; + if (offset < first) { + return first; + } + + if (info->repeatMax == REPEAT_INF || + offset < ctrl->offset.offset + info->repeatMax) { + return offset + 1; + } + + return 0; // No more matches. +} + +u64a repeatNextMatchRing(const struct RepeatInfo *info, + const union RepeatControl *ctrl, + const void *state, u64a offset); + +u64a repeatNextMatchRange(const struct RepeatInfo *info, + const union RepeatControl *ctrl, + const void *state, u64a offset); + +u64a repeatNextMatchBitmap(const struct RepeatInfo *info, + const union RepeatControl *ctrl, u64a offset); + +u64a repeatNextMatchSparseOptimalP(const struct RepeatInfo *info, + const union RepeatControl *ctrl, + const void *state, u64a offset); + +u64a repeatNextMatchTrailer(const struct RepeatInfo *info, + const union RepeatControl *ctrl, u64a offset); + +static really_inline +u64a repeatNextMatch(const struct RepeatInfo *info, + const union RepeatControl *ctrl, const void *state, + u64a offset) { + assert(info && ctrl && state); + assert(ISALIGNED(info)); + assert(ISALIGNED(ctrl)); + + switch ((enum RepeatType)info->type) { + case REPEAT_RING: + return repeatNextMatchRing(info, ctrl, state, offset); + case REPEAT_FIRST: + // fall through + case REPEAT_LAST: + return repeatNextMatchOffset(info, ctrl, offset); + case REPEAT_RANGE: + return repeatNextMatchRange(info, ctrl, state, offset); + case REPEAT_BITMAP: + return repeatNextMatchBitmap(info, ctrl, offset); + case REPEAT_SPARSE_OPTIMAL_P: + return repeatNextMatchSparseOptimalP(info, ctrl, state, offset); + case REPEAT_TRAILER: + return repeatNextMatchTrailer(info, ctrl, offset); + case REPEAT_ALWAYS: + return offset + 1; + } + + DEBUG_PRINTF("bad repeat type %u\n", info->type); + assert(0); + return 0; +} + +static really_inline +void repeatStoreFirst(union RepeatControl *ctrl, u64a offset, + char is_alive) { + if (is_alive) { + return; + } + ctrl->offset.offset = offset; +} + +static really_inline +void repeatStoreLast(union RepeatControl *ctrl, u64a offset, + UNUSED char is_alive) { + assert(!is_alive || offset >= ctrl->offset.offset); + ctrl->offset.offset = offset; +} + +void repeatStoreRing(const struct RepeatInfo *info, + union RepeatControl *ctrl, void *state, u64a offset, + char is_alive); + +void repeatStoreRange(const struct RepeatInfo *info, + union RepeatControl *ctrl, void *state, u64a offset, + char is_alive); + +void repeatStoreBitmap(const struct RepeatInfo *info, + union RepeatControl *ctrl, u64a offset, + char is_alive); + +void repeatStoreSparseOptimalP(const struct RepeatInfo *info, + union RepeatControl *ctrl, void *state, + u64a offset, char is_alive); + +void repeatStoreTrailer(const struct RepeatInfo *info, + union RepeatControl *ctrl, u64a offset, + char is_alive); + +static really_inline +void repeatStore(const struct RepeatInfo *info, union RepeatControl *ctrl, + void *state, u64a offset, char is_alive) { + assert(info && ctrl && state); + assert(ISALIGNED(info)); + assert(ISALIGNED(ctrl)); + + assert(info->repeatMin <= info->repeatMax); + assert(info->repeatMax <= REPEAT_INF); + + switch ((enum RepeatType)info->type) { + case REPEAT_RING: + repeatStoreRing(info, ctrl, state, offset, is_alive); + break; + case REPEAT_FIRST: + repeatStoreFirst(ctrl, offset, is_alive); + break; + case REPEAT_LAST: + repeatStoreLast(ctrl, offset, is_alive); + break; + case REPEAT_RANGE: + 
repeatStoreRange(info, ctrl, state, offset, is_alive); + break; + case REPEAT_BITMAP: + repeatStoreBitmap(info, ctrl, offset, is_alive); + break; + case REPEAT_SPARSE_OPTIMAL_P: + repeatStoreSparseOptimalP(info, ctrl, state, offset, is_alive); + break; + case REPEAT_TRAILER: + repeatStoreTrailer(info, ctrl, offset, is_alive); + break; + case REPEAT_ALWAYS: + /* nothing to do - no state */ + break; + } +} + +static really_inline +enum RepeatMatch repeatHasMatchFirst(const struct RepeatInfo *info, + const union RepeatControl *ctrl, + u64a offset) { + if (offset < ctrl->offset.offset + info->repeatMin) { + return REPEAT_NOMATCH; + } + + // FIRST models are {N,} repeats, i.e. they always have inf max depth. + assert(info->repeatMax == REPEAT_INF); + return REPEAT_MATCH; +} + +static really_inline +enum RepeatMatch repeatHasMatchLast(const struct RepeatInfo *info, + const union RepeatControl *ctrl, + u64a offset) { + if (offset < ctrl->offset.offset + info->repeatMin) { + return REPEAT_NOMATCH; + } + assert(info->repeatMax < REPEAT_INF); + if (offset <= ctrl->offset.offset + info->repeatMax) { + return REPEAT_MATCH; + } + return REPEAT_STALE; +} + +enum RepeatMatch repeatHasMatchRing(const struct RepeatInfo *info, + const union RepeatControl *ctrl, + const void *state, u64a offset); + +enum RepeatMatch repeatHasMatchRange(const struct RepeatInfo *info, + const union RepeatControl *ctrl, + const void *state, u64a offset); + +enum RepeatMatch repeatHasMatchSparseOptimalP(const struct RepeatInfo *info, + const union RepeatControl *ctrl, + const void *state, u64a offset); + +enum RepeatMatch repeatHasMatchBitmap(const struct RepeatInfo *info, + const union RepeatControl *ctrl, + u64a offset); + +enum RepeatMatch repeatHasMatchTrailer(const struct RepeatInfo *info, + const union RepeatControl *ctrl, + u64a offset); + +static really_inline +enum RepeatMatch repeatHasMatch(const struct RepeatInfo *info, + const union RepeatControl *ctrl, + const void *state, u64a offset) { + assert(info && ctrl && state); + assert(ISALIGNED(info)); + assert(ISALIGNED(ctrl)); + + switch ((enum RepeatType)info->type) { + case REPEAT_RING: + return repeatHasMatchRing(info, ctrl, state, offset); + case REPEAT_FIRST: + return repeatHasMatchFirst(info, ctrl, offset); + case REPEAT_LAST: + return repeatHasMatchLast(info, ctrl, offset); + case REPEAT_RANGE: + return repeatHasMatchRange(info, ctrl, state, offset); + case REPEAT_BITMAP: + return repeatHasMatchBitmap(info, ctrl, offset); + case REPEAT_SPARSE_OPTIMAL_P: + return repeatHasMatchSparseOptimalP(info, ctrl, state, offset); + case REPEAT_TRAILER: + return repeatHasMatchTrailer(info, ctrl, offset); + case REPEAT_ALWAYS: + return REPEAT_MATCH; + } + + assert(0); + return REPEAT_NOMATCH; +} + +#ifdef __cplusplus +} +#endif + +#endif // REPEAT_H diff --git a/regex/nfa/repeat_internal.h b/regex/nfa/repeat_internal.h new file mode 100644 index 000000000..9e3f455c8 --- /dev/null +++ b/regex/nfa/repeat_internal.h @@ -0,0 +1,218 @@ +/* + * Copyright (c) 2015, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef REPEAT_INTERNAL_H +#define REPEAT_INTERNAL_H + +#include "ue2common.h" + +/** \file + * \brief Bounded Repeat models. + * + * Used by the NFA, to represent bounded repeats managed via special POS and + * TUG exceptions, and by the LBR (limited bounded repeat) and Castle + * specialist engines. + * + * We currently have a number of different kinds of bounded repeat model, for + * different kinds of {N,M} repeats, described by ::RepeatType. + */ + +/** Different types of bounded repeats. */ +enum RepeatType { + /** General mechanism for tracking {N,M} repeats. Stores the first top as + * an absolute offset, then subsequent tops in the {N,M} range as a ring of + * relative top indices stored in a multibit. */ + REPEAT_RING, + + /** Used to track {N,} repeats. Uses the \ref RepeatOffsetControl structure, + * since only the first top encountered needs to be stored. */ + REPEAT_FIRST, + + /** Used to track {0,N} repeats. Much like ::REPEAT_FIRST, except that we + * store the most recent top encountered. */ + REPEAT_LAST, + + /** Like ::REPEAT_RING, this is also used for {N,M} repeats, but for cases + * where there is a large difference between N and M, and developed to + * reduce the state requirements of this case (relative to the RING model). + * Uses a small ordered array of top indices relative to \ref + * RepeatRangeControl::offset. */ + REPEAT_RANGE, + + /** Used for {N,M} repeats where 0 < M <= 64. Uses the \ref + * RepeatBitmapControl structure at runtime. */ + REPEAT_BITMAP, + + /** Optimal mechanism for tracking {N,M} repeats when there is a bound on + * how frequently they can be retriggered. + * Assume f(repeat, min) representing the number of possible bit patterns + * we can have for repeat size = repeat, minimum period = min + * We will have the following recurrence relation: + * f(repeat, min) = f(repeat - 1, min) + f(repeat - min, min); + * We use this recurrence to encode bit patterns with 64-bit values by + * referencing a table that stores values from f(0, min) to f(repeat, min) + * eg: repeat = 5, min = 2. 10001 => f(4,2) + f(0,2) = 9. + * We search the optimal patch size between min and repeat in advance and + * use the scheme above to do encoding and decoding to reduce stream state + * size. */ + REPEAT_SPARSE_OPTIMAL_P, + + /** Used for {N,M} repeats where 0 < N < 64. Uses the + * \ref RepeatTrailerControl structure at runtime. */ + REPEAT_TRAILER, + + /** Degenerate repeat that always returns true. Used by castle for pseudo + * [^X]* repeats. 
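+ * There is no control block or stream state to maintain for this model.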
*/ + REPEAT_ALWAYS, +}; + +/** + * \brief Value used to represent an unbounded max repeat. + * + * Note that we do not support \ref RepeatInfo::repeatMax values larger than + * this. + */ +#define REPEAT_INF 65535 + +/** Max slots used by ::REPEAT_RANGE repeat model. */ +#define REPEAT_RANGE_MAX_SLOTS 16 + +/** Structure describing a bounded repeat in the bytecode */ +struct RepeatInfo { + u8 type; //!< from enum RepeatType. + u32 repeatMin; //!< minimum number of repeats. + u32 repeatMax; //!< maximum number of repeats, or REPEAT_INF if unbounded. + + /** Maximum value that is required to be stored in the control block + * counters. Any value greater than this will be capped at the horizon. + */ + u32 horizon; + + /** Size of the compressed control block in bytes. This is what is written + * out to stream state at stream boundaries. */ + u32 packedCtrlSize; + + /** Size of the repeat state block in bytes. This is where the REPEAT_RANGE + * vector and REPEAT_RING multibit are stored, in stream state, and they + * are manipulated directly (i.e. not copied at stream boundaries). */ + u32 stateSize; + + /** How soon after one trigger we can see the next trigger. + * Used by REPEAT_SPARSE_OPTIMAL_P. */ + u32 minPeriod; + + /** Packed control block field sizes (in bits), used by REPEAT_TRAILER. */ + u32 packedFieldSizes[2]; + + /* Number of patches, used by REPEAT_SPARSE_OPTIMAL_P. */ + u32 patchCount; + + /* Optimal patch length, used by REPEAT_SPARSE_OPTIMAL_P. */ + u32 patchSize; + + /* Encoding patch length in bytes, used by REPEAT_SPARSE_OPTIMAL_P. */ + u32 encodingSize; + + /* RepeatInfo struct length including table size. */ + u32 length; + + /** Offset of patches relative to the start of repeat stream state, + * used by REPEAT_SPARSE_OPTIMAL_P. */ + u32 patchesOffset; +}; + +/** Runtime control block structure for ::REPEAT_RING and + * ::REPEAT_SPARSE_OPTIMAL_P bounded repeats. Note that this struct is packed + * (may not be aligned). */ +struct RepeatRingControl { + u64a offset; //!< index of first top. + u16 first; //!< start index in ring. + u16 last; //!< end index in ring. +}; + +/** Runtime control block structure for ::REPEAT_RANGE bounded repeats. Note + * that this struct is packed (may not be aligned). */ +struct RepeatRangeControl { + u64a offset; //!< index of first top. + u8 num; //!< number of elements in array. +}; + +/** Runtime control block structure for cases where only a single offset is + * needed to track the repeat, both ::REPEAT_FIRST and ::REPEAT_LAST. Note that + * this struct is packed (may not be aligned). */ +struct RepeatOffsetControl { + u64a offset; //!< index of a top. +}; + +/** Runtime control block structure for ::REPEAT_BITMAP bounded repeats. */ +struct RepeatBitmapControl { + u64a offset; //!< index of first top. + u64a bitmap; //!< forward bitmap of tops relative to base offset. +}; + +/** Runtime control block structure for ::REPEAT_TRAILER bounded repeats. */ +struct RepeatTrailerControl { + u64a offset; //!< min extent of most recent match window. + u64a bitmap; //!< trailing bitmap of earlier matches, relative to offset. +}; + +/** \brief Union of control block types, used at runtime. */ +union RepeatControl { + struct RepeatRingControl ring; + struct RepeatRangeControl range; + struct RepeatOffsetControl offset; + struct RepeatBitmapControl bitmap; + struct RepeatTrailerControl trailer; +}; + +/** For debugging, returns the name of a repeat model. 
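+ * Values outside ::RepeatType fall through to "UNKNOWN".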
*/ +static really_inline UNUSED +const char *repeatTypeName(u8 type) { + switch ((enum RepeatType)type) { + case REPEAT_RING: + return "RING"; + case REPEAT_FIRST: + return "FIRST"; + case REPEAT_LAST: + return "LAST"; + case REPEAT_RANGE: + return "RANGE"; + case REPEAT_BITMAP: + return "BITMAP"; + case REPEAT_SPARSE_OPTIMAL_P: + return "SPARSE_OPTIMAL_P"; + case REPEAT_TRAILER: + return "TRAILER"; + case REPEAT_ALWAYS: + return "ALWAYS"; + } + assert(0); + return "UNKNOWN"; +} + +#endif // REPEAT_INTERNAL_H diff --git a/regex/nfa/sheng.c b/regex/nfa/sheng.c new file mode 100644 index 000000000..3f36e2189 --- /dev/null +++ b/regex/nfa/sheng.c @@ -0,0 +1,1877 @@ +/* + * Copyright (c) 2016-2020, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "sheng.h" + +#include "accel.h" +#include "sheng_internal.h" +#include "nfa_api.h" +#include "nfa_api_queue.h" +#include "nfa_internal.h" +#include "util/bitutils.h" +#include "util/compare.h" +#include "util/join.h" +#include "util/simd_utils.h" + +enum MatchMode { + CALLBACK_OUTPUT, + STOP_AT_MATCH, + NO_MATCHES +}; + +static really_inline +const struct sheng *get_sheng(const struct NFA *n) { + return (const struct sheng *)getImplNfa(n); +} + +static really_inline +const struct sstate_aux *get_aux(const struct sheng *sh, u8 id) { + u32 offset = sh->aux_offset - sizeof(struct NFA) + + (id & SHENG_STATE_MASK) * sizeof(struct sstate_aux); + DEBUG_PRINTF("Getting aux for state %u at offset %llu\n", + id & SHENG_STATE_MASK, (u64a)offset + sizeof(struct NFA)); + return (const struct sstate_aux *)((const char *) sh + offset); +} + +static really_inline +const union AccelAux *get_accel(const struct sheng *sh, u8 id) { + const struct sstate_aux *saux = get_aux(sh, id); + DEBUG_PRINTF("Getting accel aux at offset %u\n", saux->accel); + const union AccelAux *aux = (const union AccelAux *) + ((const char *)sh + saux->accel - sizeof(struct NFA)); + return aux; +} + +static really_inline +const struct report_list *get_rl(const struct sheng *sh, + const struct sstate_aux *aux) { + DEBUG_PRINTF("Getting report list at offset %u\n", aux->accept); + return (const struct report_list *) + ((const char *)sh + aux->accept - sizeof(struct NFA)); +} + +static really_inline +const struct report_list *get_eod_rl(const struct sheng *sh, + const struct sstate_aux *aux) { + DEBUG_PRINTF("Getting EOD report list at offset %u\n", aux->accept); + return (const struct report_list *) + ((const char *)sh + aux->accept_eod - sizeof(struct NFA)); +} + +static really_inline +char shengHasAccept(const struct sheng *sh, const struct sstate_aux *aux, + ReportID report) { + assert(sh && aux); + + const struct report_list *rl = get_rl(sh, aux); + assert(ISALIGNED_N(rl, 4)); + + DEBUG_PRINTF("report list has %u entries\n", rl->count); + + for (u32 i = 0; i < rl->count; i++) { + if (rl->report[i] == report) { + DEBUG_PRINTF("reporting %u\n", rl->report[i]); + return 1; + } + } + + return 0; +} + +static really_inline +char fireSingleReport(NfaCallback cb, void *ctxt, ReportID r, u64a loc) { + DEBUG_PRINTF("reporting %u\n", r); + if (cb(0, loc, r, ctxt) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; /* termination requested */ + } + return MO_CONTINUE_MATCHING; /* continue execution */ +} + +static really_inline +char fireReports(const struct sheng *sh, NfaCallback cb, void *ctxt, + const u8 state, u64a loc, u8 *const cached_accept_state, + ReportID *const cached_accept_id, char eod) { + DEBUG_PRINTF("reporting matches @ %llu\n", loc); + + if (!eod && state == *cached_accept_state) { + DEBUG_PRINTF("reporting %u\n", *cached_accept_id); + if (cb(0, loc, *cached_accept_id, ctxt) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; /* termination requested */ + } + + return MO_CONTINUE_MATCHING; /* continue execution */ + } + const struct sstate_aux *aux = get_aux(sh, state); + const struct report_list *rl = eod ? 
get_eod_rl(sh, aux) : get_rl(sh, aux); + assert(ISALIGNED(rl)); + + DEBUG_PRINTF("report list has %u entries\n", rl->count); + u32 count = rl->count; + + if (!eod && count == 1) { + *cached_accept_state = state; + *cached_accept_id = rl->report[0]; + + DEBUG_PRINTF("reporting %u\n", rl->report[0]); + if (cb(0, loc, rl->report[0], ctxt) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; /* termination requested */ + } + + return MO_CONTINUE_MATCHING; /* continue execution */ + } + + for (u32 i = 0; i < count; i++) { + DEBUG_PRINTF("reporting %u\n", rl->report[i]); + if (cb(0, loc, rl->report[i], ctxt) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; /* termination requested */ + } + } + return MO_CONTINUE_MATCHING; /* continue execution */ +} + +#if defined(HAVE_AVX512VBMI) +// Sheng32 +static really_inline +const struct sheng32 *get_sheng32(const struct NFA *n) { + return (const struct sheng32 *)getImplNfa(n); +} + +static really_inline +const struct sstate_aux *get_aux32(const struct sheng32 *sh, u8 id) { + u32 offset = sh->aux_offset - sizeof(struct NFA) + + (id & SHENG32_STATE_MASK) * sizeof(struct sstate_aux); + DEBUG_PRINTF("Getting aux for state %u at offset %llu\n", + id & SHENG32_STATE_MASK, (u64a)offset + sizeof(struct NFA)); + return (const struct sstate_aux *)((const char *) sh + offset); +} + +static really_inline +const union AccelAux *get_accel32(const struct sheng32 *sh, u8 id) { + const struct sstate_aux *saux = get_aux32(sh, id); + DEBUG_PRINTF("Getting accel aux at offset %u\n", saux->accel); + const union AccelAux *aux = (const union AccelAux *) + ((const char *)sh + saux->accel - sizeof(struct NFA)); + return aux; +} + +static really_inline +const struct report_list *get_rl32(const struct sheng32 *sh, + const struct sstate_aux *aux) { + DEBUG_PRINTF("Getting report list at offset %u\n", aux->accept); + return (const struct report_list *) + ((const char *)sh + aux->accept - sizeof(struct NFA)); +} + +static really_inline +const struct report_list *get_eod_rl32(const struct sheng32 *sh, + const struct sstate_aux *aux) { + DEBUG_PRINTF("Getting EOD report list at offset %u\n", aux->accept); + return (const struct report_list *) + ((const char *)sh + aux->accept_eod - sizeof(struct NFA)); +} + +static really_inline +char sheng32HasAccept(const struct sheng32 *sh, const struct sstate_aux *aux, + ReportID report) { + assert(sh && aux); + + const struct report_list *rl = get_rl32(sh, aux); + assert(ISALIGNED_N(rl, 4)); + + DEBUG_PRINTF("report list has %u entries\n", rl->count); + + for (u32 i = 0; i < rl->count; i++) { + if (rl->report[i] == report) { + DEBUG_PRINTF("reporting %u\n", rl->report[i]); + return 1; + } + } + + return 0; +} + +static really_inline +char fireReports32(const struct sheng32 *sh, NfaCallback cb, void *ctxt, + const u8 state, u64a loc, u8 *const cached_accept_state, + ReportID *const cached_accept_id, char eod) { + DEBUG_PRINTF("reporting matches @ %llu\n", loc); + + if (!eod && state == *cached_accept_state) { + DEBUG_PRINTF("reporting %u\n", *cached_accept_id); + if (cb(0, loc, *cached_accept_id, ctxt) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; /* termination requested */ + } + + return MO_CONTINUE_MATCHING; /* continue execution */ + } + const struct sstate_aux *aux = get_aux32(sh, state); + const struct report_list *rl = eod ? 
get_eod_rl32(sh, aux) : + get_rl32(sh, aux); + assert(ISALIGNED(rl)); + + DEBUG_PRINTF("report list has %u entries\n", rl->count); + u32 count = rl->count; + + if (!eod && count == 1) { + *cached_accept_state = state; + *cached_accept_id = rl->report[0]; + + DEBUG_PRINTF("reporting %u\n", rl->report[0]); + if (cb(0, loc, rl->report[0], ctxt) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; /* termination requested */ + } + + return MO_CONTINUE_MATCHING; /* continue execution */ + } + + for (u32 i = 0; i < count; i++) { + DEBUG_PRINTF("reporting %u\n", rl->report[i]); + if (cb(0, loc, rl->report[i], ctxt) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; /* termination requested */ + } + } + return MO_CONTINUE_MATCHING; /* continue execution */ +} + +// Sheng64 +static really_inline +const struct sheng64 *get_sheng64(const struct NFA *n) { + return (const struct sheng64 *)getImplNfa(n); +} + +static really_inline +const struct sstate_aux *get_aux64(const struct sheng64 *sh, u8 id) { + u32 offset = sh->aux_offset - sizeof(struct NFA) + + (id & SHENG64_STATE_MASK) * sizeof(struct sstate_aux); + DEBUG_PRINTF("Getting aux for state %u at offset %llu\n", + id & SHENG64_STATE_MASK, (u64a)offset + sizeof(struct NFA)); + return (const struct sstate_aux *)((const char *) sh + offset); +} + +static really_inline +const struct report_list *get_rl64(const struct sheng64 *sh, + const struct sstate_aux *aux) { + DEBUG_PRINTF("Getting report list at offset %u\n", aux->accept); + return (const struct report_list *) + ((const char *)sh + aux->accept - sizeof(struct NFA)); +} + +static really_inline +const struct report_list *get_eod_rl64(const struct sheng64 *sh, + const struct sstate_aux *aux) { + DEBUG_PRINTF("Getting EOD report list at offset %u\n", aux->accept); + return (const struct report_list *) + ((const char *)sh + aux->accept_eod - sizeof(struct NFA)); +} + +static really_inline +char sheng64HasAccept(const struct sheng64 *sh, const struct sstate_aux *aux, + ReportID report) { + assert(sh && aux); + + const struct report_list *rl = get_rl64(sh, aux); + assert(ISALIGNED_N(rl, 4)); + + DEBUG_PRINTF("report list has %u entries\n", rl->count); + + for (u32 i = 0; i < rl->count; i++) { + if (rl->report[i] == report) { + DEBUG_PRINTF("reporting %u\n", rl->report[i]); + return 1; + } + } + + return 0; +} + +static really_inline +char fireReports64(const struct sheng64 *sh, NfaCallback cb, void *ctxt, + const u8 state, u64a loc, u8 *const cached_accept_state, + ReportID *const cached_accept_id, char eod) { + DEBUG_PRINTF("reporting matches @ %llu\n", loc); + + if (!eod && state == *cached_accept_state) { + DEBUG_PRINTF("reporting %u\n", *cached_accept_id); + if (cb(0, loc, *cached_accept_id, ctxt) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; /* termination requested */ + } + + return MO_CONTINUE_MATCHING; /* continue execution */ + } + const struct sstate_aux *aux = get_aux64(sh, state); + const struct report_list *rl = eod ? 
get_eod_rl64(sh, aux) : + get_rl64(sh, aux); + assert(ISALIGNED(rl)); + + DEBUG_PRINTF("report list has %u entries\n", rl->count); + u32 count = rl->count; + + if (!eod && count == 1) { + *cached_accept_state = state; + *cached_accept_id = rl->report[0]; + + DEBUG_PRINTF("reporting %u\n", rl->report[0]); + if (cb(0, loc, rl->report[0], ctxt) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; /* termination requested */ + } + + return MO_CONTINUE_MATCHING; /* continue execution */ + } + + for (u32 i = 0; i < count; i++) { + DEBUG_PRINTF("reporting %u\n", rl->report[i]); + if (cb(0, loc, rl->report[i], ctxt) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; /* termination requested */ + } + } + return MO_CONTINUE_MATCHING; /* continue execution */ +} +#endif // end of HAVE_AVX512VBMI + +/* include Sheng function definitions */ +#include "sheng_defs.h" + +static really_inline +char runShengCb(const struct sheng *sh, NfaCallback cb, void *ctxt, u64a offset, + u8 *const cached_accept_state, ReportID *const cached_accept_id, + const u8 *cur_buf, const u8 *start, const u8 *end, u8 can_die, + u8 has_accel, u8 single, const u8 **scanned, u8 *state) { + DEBUG_PRINTF("Scanning %llu bytes (offset %llu) in callback mode\n", + (u64a)(end - start), offset); + DEBUG_PRINTF("start: %lli end: %lli\n", (s64a)(start - cur_buf), + (s64a)(end - cur_buf)); + DEBUG_PRINTF("can die: %u has accel: %u single: %u\n", !!can_die, + !!has_accel, !!single); + int rv; + /* scan and report all matches */ + if (can_die) { + if (has_accel) { + rv = sheng4_coda(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, start, + end, scanned); + } else { + rv = sheng4_cod(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, start, + end, scanned); + } + if (rv == MO_HALT_MATCHING) { + return MO_DEAD; + } + rv = sheng_cod(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, *scanned, end, + scanned); + } else { + if (has_accel) { + rv = sheng4_coa(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, start, + end, scanned); + } else { + rv = sheng4_co(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, start, + end, scanned); + } + if (rv == MO_HALT_MATCHING) { + return MO_DEAD; + } + rv = sheng_co(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, *scanned, end, + scanned); + } + if (rv == MO_HALT_MATCHING) { + return MO_DEAD; + } + return MO_ALIVE; +} + +static really_inline +void runShengNm(const struct sheng *sh, NfaCallback cb, void *ctxt, u64a offset, + u8 *const cached_accept_state, ReportID *const cached_accept_id, + const u8 *cur_buf, const u8 *start, const u8 *end, u8 can_die, + u8 has_accel, u8 single, const u8 **scanned, u8 *state) { + DEBUG_PRINTF("Scanning %llu bytes (offset %llu) in nomatch mode\n", + (u64a)(end - start), offset); + DEBUG_PRINTF("start: %lli end: %lli\n", (s64a)(start - cur_buf), + (s64a)(end - cur_buf)); + DEBUG_PRINTF("can die: %u has accel: %u single: %u\n", !!can_die, + !!has_accel, !!single); + /* just scan the buffer */ + if (can_die) { + if (has_accel) { + sheng4_nmda(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, start, end, + scanned); + } else { + sheng4_nmd(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, start, end, + scanned); + } + sheng_nmd(state, cb, ctxt, sh, cached_accept_state, cached_accept_id, + single, 
offset, cur_buf, *scanned, end, scanned); + } else { + sheng4_nm(state, cb, ctxt, sh, cached_accept_state, cached_accept_id, + single, offset, cur_buf, start, end, scanned); + sheng_nm(state, cb, ctxt, sh, cached_accept_state, cached_accept_id, + single, offset, cur_buf, *scanned, end, scanned); + } +} + +static really_inline +char runShengSam(const struct sheng *sh, NfaCallback cb, void *ctxt, + u64a offset, u8 *const cached_accept_state, + ReportID *const cached_accept_id, const u8 *cur_buf, + const u8 *start, const u8 *end, u8 can_die, u8 has_accel, + u8 single, const u8 **scanned, u8 *state) { + DEBUG_PRINTF("Scanning %llu bytes (offset %llu) in stop at match mode\n", + (u64a)(end - start), offset); + DEBUG_PRINTF("start: %lli end: %lli\n", (s64a)(start - cur_buf), + (s64a)(end - cur_buf)); + DEBUG_PRINTF("can die: %u has accel: %u single: %u\n", !!can_die, + !!has_accel, !!single); + int rv; + /* scan until first match */ + if (can_die) { + if (has_accel) { + rv = sheng4_samda(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, start, + end, scanned); + } else { + rv = sheng4_samd(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, start, + end, scanned); + } + if (rv == MO_HALT_MATCHING) { + return MO_DEAD; + } + /* if we stopped before we expected, we found a match */ + if (rv == MO_MATCHES_PENDING) { + return MO_MATCHES_PENDING; + } + + rv = sheng_samd(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, *scanned, + end, scanned); + } else { + if (has_accel) { + rv = sheng4_sama(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, start, + end, scanned); + } else { + rv = sheng4_sam(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, start, + end, scanned); + } + if (rv == MO_HALT_MATCHING) { + return MO_DEAD; + } + /* if we stopped before we expected, we found a match */ + if (rv == MO_MATCHES_PENDING) { + return MO_MATCHES_PENDING; + } + + rv = sheng_sam(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, *scanned, end, + scanned); + } + if (rv == MO_HALT_MATCHING) { + return MO_DEAD; + } + /* if we stopped before we expected, we found a match */ + if (rv == MO_MATCHES_PENDING) { + return MO_MATCHES_PENDING; + } + return MO_ALIVE; +} + +static never_inline +char runSheng(const struct sheng *sh, struct mq *q, s64a b_end, + enum MatchMode mode) { + u8 state = *(u8 *)q->state; + u8 can_die = sh->flags & SHENG_FLAG_CAN_DIE; + u8 has_accel = sh->flags & SHENG_FLAG_HAS_ACCEL; + u8 single = sh->flags & SHENG_FLAG_SINGLE_REPORT; + + u8 cached_accept_state = 0; + ReportID cached_accept_id = 0; + + DEBUG_PRINTF("starting Sheng execution in state %u\n", + state & SHENG_STATE_MASK); + + if (q->report_current) { + DEBUG_PRINTF("reporting current pending matches\n"); + assert(sh); + + q->report_current = 0; + + int rv; + if (single) { + rv = fireSingleReport(q->cb, q->context, sh->report, + q_cur_offset(q)); + } else { + rv = fireReports(sh, q->cb, q->context, state, q_cur_offset(q), + &cached_accept_state, &cached_accept_id, 0); + } + if (rv == MO_HALT_MATCHING) { + DEBUG_PRINTF("exiting in state %u\n", state & SHENG_STATE_MASK); + return MO_DEAD; + } + + DEBUG_PRINTF("proceeding with matching\n"); + } + + assert(q_cur_type(q) == MQE_START); + s64a start = q_cur_loc(q); + + DEBUG_PRINTF("offset: %lli, location: %lli, mode: %s\n", q->offset, start, + mode == CALLBACK_OUTPUT ? 
"CALLBACK OUTPUT" : + mode == NO_MATCHES ? "NO MATCHES" : + mode == STOP_AT_MATCH ? "STOP AT MATCH" : "???"); + + DEBUG_PRINTF("processing event @ %lli: %s\n", q->offset + q_cur_loc(q), + q_cur_type(q) == MQE_START ? "START" : + q_cur_type(q) == MQE_TOP ? "TOP" : + q_cur_type(q) == MQE_END ? "END" : "???"); + + const u8* cur_buf; + if (start < 0) { + DEBUG_PRINTF("negative location, scanning history\n"); + DEBUG_PRINTF("min location: %zd\n", -q->hlength); + cur_buf = q->history + q->hlength; + } else { + DEBUG_PRINTF("positive location, scanning buffer\n"); + DEBUG_PRINTF("max location: %lli\n", b_end); + cur_buf = q->buffer; + } + + /* if we our queue event is past our end */ + if (mode != NO_MATCHES && q_cur_loc(q) > b_end) { + DEBUG_PRINTF("current location past buffer end\n"); + DEBUG_PRINTF("setting q location to %llu\n", b_end); + DEBUG_PRINTF("exiting in state %u\n", state & SHENG_STATE_MASK); + q->items[q->cur].location = b_end; + return MO_ALIVE; + } + + q->cur++; + + s64a cur_start = start; + + while (1) { + DEBUG_PRINTF("processing event @ %lli: %s\n", q->offset + q_cur_loc(q), + q_cur_type(q) == MQE_START ? "START" : + q_cur_type(q) == MQE_TOP ? "TOP" : + q_cur_type(q) == MQE_END ? "END" : "???"); + s64a end = q_cur_loc(q); + if (mode != NO_MATCHES) { + end = MIN(end, b_end); + } + assert(end <= (s64a) q->length); + s64a cur_end = end; + + /* we may cross the border between history and current buffer */ + if (cur_start < 0) { + cur_end = MIN(0, cur_end); + } + + DEBUG_PRINTF("start: %lli end: %lli\n", start, end); + + /* don't scan zero length buffer */ + if (cur_start != cur_end) { + const u8 * scanned = cur_buf; + char rv; + + if (mode == NO_MATCHES) { + runShengNm(sh, q->cb, q->context, q->offset, + &cached_accept_state, &cached_accept_id, cur_buf, + cur_buf + cur_start, cur_buf + cur_end, can_die, + has_accel, single, &scanned, &state); + } else if (mode == CALLBACK_OUTPUT) { + rv = runShengCb(sh, q->cb, q->context, q->offset, + &cached_accept_state, &cached_accept_id, + cur_buf, cur_buf + cur_start, cur_buf + cur_end, + can_die, has_accel, single, &scanned, &state); + if (rv == MO_DEAD) { + DEBUG_PRINTF("exiting in state %u\n", + state & SHENG_STATE_MASK); + return MO_DEAD; + } + } else if (mode == STOP_AT_MATCH) { + rv = runShengSam(sh, q->cb, q->context, q->offset, + &cached_accept_state, &cached_accept_id, + cur_buf, cur_buf + cur_start, + cur_buf + cur_end, can_die, has_accel, single, + &scanned, &state); + if (rv == MO_DEAD) { + DEBUG_PRINTF("exiting in state %u\n", + state & SHENG_STATE_MASK); + return rv; + } else if (rv == MO_MATCHES_PENDING) { + assert(q->cur); + DEBUG_PRINTF("found a match, setting q location to %zd\n", + scanned - cur_buf + 1); + q->cur--; + q->items[q->cur].type = MQE_START; + q->items[q->cur].location = + scanned - cur_buf + 1; /* due to exiting early */ + *(u8 *)q->state = state; + DEBUG_PRINTF("exiting in state %u\n", + state & SHENG_STATE_MASK); + return rv; + } + } else { + assert(!"invalid scanning mode!"); + } + assert(scanned == cur_buf + cur_end); + + cur_start = cur_end; + } + + /* if we our queue event is past our end */ + if (mode != NO_MATCHES && q_cur_loc(q) > b_end) { + DEBUG_PRINTF("current location past buffer end\n"); + DEBUG_PRINTF("setting q location to %llu\n", b_end); + DEBUG_PRINTF("exiting in state %u\n", state & SHENG_STATE_MASK); + q->cur--; + q->items[q->cur].type = MQE_START; + q->items[q->cur].location = b_end; + *(u8 *)q->state = state; + return MO_ALIVE; + } + + /* crossing over into actual buffer */ + if 
(cur_start == 0) { + DEBUG_PRINTF("positive location, scanning buffer\n"); + DEBUG_PRINTF("max offset: %lli\n", b_end); + cur_buf = q->buffer; + } + + /* continue scanning the same buffer */ + if (end != cur_end) { + continue; + } + + switch (q_cur_type(q)) { + case MQE_END: + *(u8 *)q->state = state; + q->cur++; + DEBUG_PRINTF("exiting in state %u\n", state & SHENG_STATE_MASK); + if (can_die) { + return (state & SHENG_STATE_DEAD) ? MO_DEAD : MO_ALIVE; + } + return MO_ALIVE; + case MQE_TOP: + if (q->offset + cur_start == 0) { + DEBUG_PRINTF("Anchored start, going to state %u\n", + sh->anchored); + state = sh->anchored; + } else { + u8 new_state = get_aux(sh, state)->top; + DEBUG_PRINTF("Top event %u->%u\n", state & SHENG_STATE_MASK, + new_state & SHENG_STATE_MASK); + state = new_state; + } + break; + default: + assert(!"invalid queue event"); + break; + } + q->cur++; + } +} + +char nfaExecSheng_B(const struct NFA *n, u64a offset, const u8 *buffer, + size_t length, NfaCallback cb, void *context) { + DEBUG_PRINTF("smallwrite Sheng\n"); + assert(n->type == SHENG_NFA); + const struct sheng *sh = getImplNfa(n); + u8 state = sh->anchored; + u8 can_die = sh->flags & SHENG_FLAG_CAN_DIE; + u8 has_accel = sh->flags & SHENG_FLAG_HAS_ACCEL; + u8 single = sh->flags & SHENG_FLAG_SINGLE_REPORT; + u8 cached_accept_state = 0; + ReportID cached_accept_id = 0; + + /* scan and report all matches */ + int rv; + s64a end = length; + const u8 *scanned; + + rv = runShengCb(sh, cb, context, offset, &cached_accept_state, + &cached_accept_id, buffer, buffer, buffer + end, can_die, + has_accel, single, &scanned, &state); + if (rv == MO_DEAD) { + DEBUG_PRINTF("exiting in state %u\n", + state & SHENG_STATE_MASK); + return MO_DEAD; + } + + DEBUG_PRINTF("%u\n", state & SHENG_STATE_MASK); + + const struct sstate_aux *aux = get_aux(sh, state); + + if (aux->accept_eod) { + DEBUG_PRINTF("Reporting EOD matches\n"); + fireReports(sh, cb, context, state, end + offset, &cached_accept_state, + &cached_accept_id, 1); + } + + return state & SHENG_STATE_DEAD ? 
MO_DEAD : MO_ALIVE; +} + +char nfaExecSheng_Q(const struct NFA *n, struct mq *q, s64a end) { + const struct sheng *sh = get_sheng(n); + char rv = runSheng(sh, q, end, CALLBACK_OUTPUT); + return rv; +} + +char nfaExecSheng_Q2(const struct NFA *n, struct mq *q, s64a end) { + const struct sheng *sh = get_sheng(n); + char rv = runSheng(sh, q, end, STOP_AT_MATCH); + return rv; +} + +char nfaExecSheng_QR(const struct NFA *n, struct mq *q, ReportID report) { + assert(q_cur_type(q) == MQE_START); + + const struct sheng *sh = get_sheng(n); + char rv = runSheng(sh, q, 0 /* end */, NO_MATCHES); + + if (rv && nfaExecSheng_inAccept(n, report, q)) { + return MO_MATCHES_PENDING; + } + return rv; +} + +char nfaExecSheng_inAccept(const struct NFA *n, ReportID report, struct mq *q) { + assert(n && q); + + const struct sheng *sh = get_sheng(n); + u8 s = *(const u8 *)q->state; + DEBUG_PRINTF("checking accepts for %u\n", (u8)(s & SHENG_STATE_MASK)); + + const struct sstate_aux *aux = get_aux(sh, s); + + if (!aux->accept) { + return 0; + } + + return shengHasAccept(sh, aux, report); +} + +char nfaExecSheng_inAnyAccept(const struct NFA *n, struct mq *q) { + assert(n && q); + + const struct sheng *sh = get_sheng(n); + u8 s = *(const u8 *)q->state; + DEBUG_PRINTF("checking accepts for %u\n", (u8)(s & SHENG_STATE_MASK)); + + const struct sstate_aux *aux = get_aux(sh, s); + return !!aux->accept; +} + +char nfaExecSheng_testEOD(const struct NFA *nfa, const char *state, + UNUSED const char *streamState, u64a offset, + NfaCallback cb, void *ctxt) { + assert(nfa); + + const struct sheng *sh = get_sheng(nfa); + u8 s = *(const u8 *)state; + DEBUG_PRINTF("checking EOD accepts for %u\n", (u8)(s & SHENG_STATE_MASK)); + + const struct sstate_aux *aux = get_aux(sh, s); + + if (!aux->accept_eod) { + return MO_CONTINUE_MATCHING; + } + + return fireReports(sh, cb, ctxt, s, offset, NULL, NULL, 1); +} + +char nfaExecSheng_reportCurrent(const struct NFA *n, struct mq *q) { + const struct sheng *sh = (const struct sheng *)getImplNfa(n); + NfaCallback cb = q->cb; + void *ctxt = q->context; + u8 s = *(u8 *)q->state; + const struct sstate_aux *aux = get_aux(sh, s); + u64a offset = q_cur_offset(q); + u8 cached_state_id = 0; + ReportID cached_report_id = 0; + assert(q_cur_type(q) == MQE_START); + + if (aux->accept) { + if (sh->flags & SHENG_FLAG_SINGLE_REPORT) { + fireSingleReport(cb, ctxt, sh->report, offset); + } else { + fireReports(sh, cb, ctxt, s, offset, &cached_state_id, + &cached_report_id, 0); + } + } + + return 0; +} + +char nfaExecSheng_initCompressedState(const struct NFA *nfa, u64a offset, + void *state, UNUSED u8 key) { + const struct sheng *sh = get_sheng(nfa); + u8 *s = (u8 *)state; + *s = offset ? 
sh->floating: sh->anchored; + return !(*s & SHENG_STATE_DEAD); +} + +char nfaExecSheng_queueInitState(const struct NFA *nfa, struct mq *q) { + assert(nfa->scratchStateSize == 1); + + /* starting in floating state */ + const struct sheng *sh = get_sheng(nfa); + *(u8 *)q->state = sh->floating; + DEBUG_PRINTF("starting in floating state\n"); + return 0; +} + +char nfaExecSheng_queueCompressState(UNUSED const struct NFA *nfa, + const struct mq *q, UNUSED s64a loc) { + void *dest = q->streamState; + const void *src = q->state; + assert(nfa->scratchStateSize == 1); + assert(nfa->streamStateSize == 1); + *(u8 *)dest = *(const u8 *)src; + return 0; +} + +char nfaExecSheng_expandState(UNUSED const struct NFA *nfa, void *dest, + const void *src, UNUSED u64a offset, + UNUSED u8 key) { + assert(nfa->scratchStateSize == 1); + assert(nfa->streamStateSize == 1); + *(u8 *)dest = *(const u8 *)src; + return 0; +} + +#if defined(HAVE_AVX512VBMI) +// Sheng32 +static really_inline +char runSheng32Cb(const struct sheng32 *sh, NfaCallback cb, void *ctxt, + u64a offset, u8 *const cached_accept_state, + ReportID *const cached_accept_id, const u8 *cur_buf, + const u8 *start, const u8 *end, u8 can_die, + u8 has_accel, u8 single, const u8 **scanned, u8 *state) { + DEBUG_PRINTF("Scanning %llu bytes (offset %llu) in callback mode\n", + (u64a)(end - start), offset); + DEBUG_PRINTF("start: %lli end: %lli\n", (s64a)(start - cur_buf), + (s64a)(end - cur_buf)); + DEBUG_PRINTF("can die: %u has accel: %u single: %u\n", !!can_die, + !!has_accel, !!single); + int rv; + /* scan and report all matches */ + if (can_die) { + if (has_accel) { + rv = sheng32_4_coda(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, + start, end, scanned); + } else { + rv = sheng32_4_cod(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, + start, end, scanned); + } + if (rv == MO_HALT_MATCHING) { + return MO_DEAD; + } + rv = sheng32_cod(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, + *scanned, end, scanned); + } else { + if (has_accel) { + rv = sheng32_4_coa(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, + start, end, scanned); + } else { + rv = sheng32_4_co(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, + start, end, scanned); + } + if (rv == MO_HALT_MATCHING) { + return MO_DEAD; + } + rv = sheng32_co(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, + *scanned, end, scanned); + } + if (rv == MO_HALT_MATCHING) { + return MO_DEAD; + } + return MO_ALIVE; +} + +static really_inline +void runSheng32Nm(const struct sheng32 *sh, NfaCallback cb, void *ctxt, + u64a offset, u8 *const cached_accept_state, + ReportID *const cached_accept_id, const u8 *cur_buf, + const u8 *start, const u8 *end, u8 can_die, u8 has_accel, + u8 single, const u8 **scanned, u8 *state) { + DEBUG_PRINTF("Scanning %llu bytes (offset %llu) in nomatch mode\n", + (u64a)(end - start), offset); + DEBUG_PRINTF("start: %lli end: %lli\n", (s64a)(start - cur_buf), + (s64a)(end - cur_buf)); + DEBUG_PRINTF("can die: %u has accel: %u single: %u\n", !!can_die, + !!has_accel, !!single); + /* just scan the buffer */ + if (can_die) { + if (has_accel) { + sheng32_4_nmda(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, + start, end, scanned); + } else { + sheng32_4_nmd(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, 
offset, cur_buf, + start, end, scanned); + } + sheng32_nmd(state, cb, ctxt, sh, cached_accept_state, cached_accept_id, + single, offset, cur_buf, *scanned, end, scanned); + } else { + sheng32_4_nm(state, cb, ctxt, sh, cached_accept_state, cached_accept_id, + single, offset, cur_buf, start, end, scanned); + sheng32_nm(state, cb, ctxt, sh, cached_accept_state, cached_accept_id, + single, offset, cur_buf, *scanned, end, scanned); + } +} + +static really_inline +char runSheng32Sam(const struct sheng32 *sh, NfaCallback cb, void *ctxt, + u64a offset, u8 *const cached_accept_state, + ReportID *const cached_accept_id, const u8 *cur_buf, + const u8 *start, const u8 *end, u8 can_die, u8 has_accel, + u8 single, const u8 **scanned, u8 *state) { + DEBUG_PRINTF("Scanning %llu bytes (offset %llu) in stop at match mode\n", + (u64a)(end - start), offset); + DEBUG_PRINTF("start: %lli end: %lli\n", (s64a)(start - cur_buf), + (s64a)(end - cur_buf)); + DEBUG_PRINTF("can die: %u has accel: %u single: %u\n", !!can_die, + !!has_accel, !!single); + int rv; + /* scan until first match */ + if (can_die) { + if (has_accel) { + rv = sheng32_4_samda(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, + start, end, scanned); + } else { + rv = sheng32_4_samd(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, + start, end, scanned); + } + if (rv == MO_HALT_MATCHING) { + return MO_DEAD; + } + /* if we stopped before we expected, we found a match */ + if (rv == MO_MATCHES_PENDING) { + return MO_MATCHES_PENDING; + } + + rv = sheng32_samd(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, + *scanned, end, scanned); + } else { + if (has_accel) { + rv = sheng32_4_sama(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, + start, end, scanned); + } else { + rv = sheng32_4_sam(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, + start, end, scanned); + } + if (rv == MO_HALT_MATCHING) { + return MO_DEAD; + } + /* if we stopped before we expected, we found a match */ + if (rv == MO_MATCHES_PENDING) { + return MO_MATCHES_PENDING; + } + + rv = sheng32_sam(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, + *scanned, end, scanned); + } + if (rv == MO_HALT_MATCHING) { + return MO_DEAD; + } + /* if we stopped before we expected, we found a match */ + if (rv == MO_MATCHES_PENDING) { + return MO_MATCHES_PENDING; + } + return MO_ALIVE; +} + +static never_inline +char runSheng32(const struct sheng32 *sh, struct mq *q, s64a b_end, + enum MatchMode mode) { + u8 state = *(u8 *)q->state; + u8 can_die = sh->flags & SHENG_FLAG_CAN_DIE; + u8 has_accel = sh->flags & SHENG_FLAG_HAS_ACCEL; + u8 single = sh->flags & SHENG_FLAG_SINGLE_REPORT; + + u8 cached_accept_state = 0; + ReportID cached_accept_id = 0; + + DEBUG_PRINTF("starting Sheng32 execution in state %u\n", + state & SHENG32_STATE_MASK); + + if (q->report_current) { + DEBUG_PRINTF("reporting current pending matches\n"); + assert(sh); + + q->report_current = 0; + + int rv; + if (single) { + rv = fireSingleReport(q->cb, q->context, sh->report, + q_cur_offset(q)); + } else { + rv = fireReports32(sh, q->cb, q->context, state, q_cur_offset(q), + &cached_accept_state, &cached_accept_id, 0); + } + if (rv == MO_HALT_MATCHING) { + DEBUG_PRINTF("exiting in state %u\n", state & SHENG32_STATE_MASK); + return MO_DEAD; + } + + DEBUG_PRINTF("proceeding with matching\n"); + } + + 
assert(q_cur_type(q) == MQE_START); + s64a start = q_cur_loc(q); + + DEBUG_PRINTF("offset: %lli, location: %lli, mode: %s\n", q->offset, start, + mode == CALLBACK_OUTPUT ? "CALLBACK OUTPUT" : + mode == NO_MATCHES ? "NO MATCHES" : + mode == STOP_AT_MATCH ? "STOP AT MATCH" : "???"); + + DEBUG_PRINTF("processing event @ %lli: %s\n", q->offset + q_cur_loc(q), + q_cur_type(q) == MQE_START ? "START" : + q_cur_type(q) == MQE_TOP ? "TOP" : + q_cur_type(q) == MQE_END ? "END" : "???"); + + const u8* cur_buf; + if (start < 0) { + DEBUG_PRINTF("negative location, scanning history\n"); + DEBUG_PRINTF("min location: %zd\n", -q->hlength); + cur_buf = q->history + q->hlength; + } else { + DEBUG_PRINTF("positive location, scanning buffer\n"); + DEBUG_PRINTF("max location: %lli\n", b_end); + cur_buf = q->buffer; + } + + /* if we our queue event is past our end */ + if (mode != NO_MATCHES && q_cur_loc(q) > b_end) { + DEBUG_PRINTF("current location past buffer end\n"); + DEBUG_PRINTF("setting q location to %llu\n", b_end); + DEBUG_PRINTF("exiting in state %u\n", state & SHENG32_STATE_MASK); + q->items[q->cur].location = b_end; + return MO_ALIVE; + } + + q->cur++; + + s64a cur_start = start; + + while (1) { + DEBUG_PRINTF("processing event @ %lli: %s\n", q->offset + q_cur_loc(q), + q_cur_type(q) == MQE_START ? "START" : + q_cur_type(q) == MQE_TOP ? "TOP" : + q_cur_type(q) == MQE_END ? "END" : "???"); + s64a end = q_cur_loc(q); + if (mode != NO_MATCHES) { + end = MIN(end, b_end); + } + assert(end <= (s64a) q->length); + s64a cur_end = end; + + /* we may cross the border between history and current buffer */ + if (cur_start < 0) { + cur_end = MIN(0, cur_end); + } + + DEBUG_PRINTF("start: %lli end: %lli\n", start, end); + + /* don't scan zero length buffer */ + if (cur_start != cur_end) { + const u8 * scanned = cur_buf; + char rv; + + if (mode == NO_MATCHES) { + runSheng32Nm(sh, q->cb, q->context, q->offset, + &cached_accept_state, &cached_accept_id, cur_buf, + cur_buf + cur_start, cur_buf + cur_end, can_die, + has_accel, single, &scanned, &state); + } else if (mode == CALLBACK_OUTPUT) { + rv = runSheng32Cb(sh, q->cb, q->context, q->offset, + &cached_accept_state, &cached_accept_id, + cur_buf, cur_buf + cur_start, cur_buf + cur_end, + can_die, has_accel, single, &scanned, &state); + if (rv == MO_DEAD) { + DEBUG_PRINTF("exiting in state %u\n", + state & SHENG32_STATE_MASK); + return MO_DEAD; + } + } else if (mode == STOP_AT_MATCH) { + rv = runSheng32Sam(sh, q->cb, q->context, q->offset, + &cached_accept_state, &cached_accept_id, + cur_buf, cur_buf + cur_start, + cur_buf + cur_end, can_die, has_accel, single, + &scanned, &state); + if (rv == MO_DEAD) { + DEBUG_PRINTF("exiting in state %u\n", + state & SHENG32_STATE_MASK); + return rv; + } else if (rv == MO_MATCHES_PENDING) { + assert(q->cur); + DEBUG_PRINTF("found a match, setting q location to %zd\n", + scanned - cur_buf + 1); + q->cur--; + q->items[q->cur].type = MQE_START; + q->items[q->cur].location = + scanned - cur_buf + 1; /* due to exiting early */ + *(u8 *)q->state = state; + DEBUG_PRINTF("exiting in state %u\n", + state & SHENG32_STATE_MASK); + return rv; + } + } else { + assert(!"invalid scanning mode!"); + } + assert(scanned == cur_buf + cur_end); + + cur_start = cur_end; + } + + /* if we our queue event is past our end */ + if (mode != NO_MATCHES && q_cur_loc(q) > b_end) { + DEBUG_PRINTF("current location past buffer end\n"); + DEBUG_PRINTF("setting q location to %llu\n", b_end); + DEBUG_PRINTF("exiting in state %u\n", state & SHENG32_STATE_MASK); + 
q->cur--; + q->items[q->cur].type = MQE_START; + q->items[q->cur].location = b_end; + *(u8 *)q->state = state; + return MO_ALIVE; + } + + /* crossing over into actual buffer */ + if (cur_start == 0) { + DEBUG_PRINTF("positive location, scanning buffer\n"); + DEBUG_PRINTF("max offset: %lli\n", b_end); + cur_buf = q->buffer; + } + + /* continue scanning the same buffer */ + if (end != cur_end) { + continue; + } + + switch (q_cur_type(q)) { + case MQE_END: + *(u8 *)q->state = state; + q->cur++; + DEBUG_PRINTF("exiting in state %u\n", state & SHENG32_STATE_MASK); + if (can_die) { + return (state & SHENG32_STATE_DEAD) ? MO_DEAD : MO_ALIVE; + } + return MO_ALIVE; + case MQE_TOP: + if (q->offset + cur_start == 0) { + DEBUG_PRINTF("Anchored start, going to state %u\n", + sh->anchored); + state = sh->anchored; + } else { + u8 new_state = get_aux32(sh, state)->top; + DEBUG_PRINTF("Top event %u->%u\n", state & SHENG32_STATE_MASK, + new_state & SHENG32_STATE_MASK); + state = new_state; + } + break; + default: + assert(!"invalid queue event"); + break; + } + q->cur++; + } +} + +char nfaExecSheng32_B(const struct NFA *n, u64a offset, const u8 *buffer, + size_t length, NfaCallback cb, void *context) { + DEBUG_PRINTF("smallwrite Sheng32\n"); + assert(n->type == SHENG_NFA_32); + const struct sheng32 *sh = getImplNfa(n); + u8 state = sh->anchored; + u8 can_die = sh->flags & SHENG_FLAG_CAN_DIE; + u8 has_accel = sh->flags & SHENG_FLAG_HAS_ACCEL; + u8 single = sh->flags & SHENG_FLAG_SINGLE_REPORT; + u8 cached_accept_state = 0; + ReportID cached_accept_id = 0; + + /* scan and report all matches */ + int rv; + s64a end = length; + const u8 *scanned; + + rv = runSheng32Cb(sh, cb, context, offset, &cached_accept_state, + &cached_accept_id, buffer, buffer, buffer + end, can_die, + has_accel, single, &scanned, &state); + if (rv == MO_DEAD) { + DEBUG_PRINTF("exiting in state %u\n", + state & SHENG32_STATE_MASK); + return MO_DEAD; + } + + DEBUG_PRINTF("%u\n", state & SHENG32_STATE_MASK); + + const struct sstate_aux *aux = get_aux32(sh, state); + + if (aux->accept_eod) { + DEBUG_PRINTF("Reporting EOD matches\n"); + fireReports32(sh, cb, context, state, end + offset, + &cached_accept_state, &cached_accept_id, 1); + } + + return state & SHENG32_STATE_DEAD ? 
MO_DEAD : MO_ALIVE; +} + +char nfaExecSheng32_Q(const struct NFA *n, struct mq *q, s64a end) { + const struct sheng32 *sh = get_sheng32(n); + char rv = runSheng32(sh, q, end, CALLBACK_OUTPUT); + return rv; +} + +char nfaExecSheng32_Q2(const struct NFA *n, struct mq *q, s64a end) { + const struct sheng32 *sh = get_sheng32(n); + char rv = runSheng32(sh, q, end, STOP_AT_MATCH); + return rv; +} + +char nfaExecSheng32_QR(const struct NFA *n, struct mq *q, ReportID report) { + assert(q_cur_type(q) == MQE_START); + + const struct sheng32 *sh = get_sheng32(n); + char rv = runSheng32(sh, q, 0 /* end */, NO_MATCHES); + + if (rv && nfaExecSheng32_inAccept(n, report, q)) { + return MO_MATCHES_PENDING; + } + return rv; +} + +char nfaExecSheng32_inAccept(const struct NFA *n, ReportID report, + struct mq *q) { + assert(n && q); + + const struct sheng32 *sh = get_sheng32(n); + u8 s = *(const u8 *)q->state; + DEBUG_PRINTF("checking accepts for %u\n", (u8)(s & SHENG32_STATE_MASK)); + + const struct sstate_aux *aux = get_aux32(sh, s); + + if (!aux->accept) { + return 0; + } + + return sheng32HasAccept(sh, aux, report); +} + +char nfaExecSheng32_inAnyAccept(const struct NFA *n, struct mq *q) { + assert(n && q); + + const struct sheng32 *sh = get_sheng32(n); + u8 s = *(const u8 *)q->state; + DEBUG_PRINTF("checking accepts for %u\n", (u8)(s & SHENG32_STATE_MASK)); + + const struct sstate_aux *aux = get_aux32(sh, s); + return !!aux->accept; +} + +char nfaExecSheng32_testEOD(const struct NFA *nfa, const char *state, + UNUSED const char *streamState, u64a offset, + NfaCallback cb, void *ctxt) { + assert(nfa); + + const struct sheng32 *sh = get_sheng32(nfa); + u8 s = *(const u8 *)state; + DEBUG_PRINTF("checking EOD accepts for %u\n", (u8)(s & SHENG32_STATE_MASK)); + + const struct sstate_aux *aux = get_aux32(sh, s); + + if (!aux->accept_eod) { + return MO_CONTINUE_MATCHING; + } + + return fireReports32(sh, cb, ctxt, s, offset, NULL, NULL, 1); +} + +char nfaExecSheng32_reportCurrent(const struct NFA *n, struct mq *q) { + const struct sheng32 *sh = (const struct sheng32 *)getImplNfa(n); + NfaCallback cb = q->cb; + void *ctxt = q->context; + u8 s = *(u8 *)q->state; + const struct sstate_aux *aux = get_aux32(sh, s); + u64a offset = q_cur_offset(q); + u8 cached_state_id = 0; + ReportID cached_report_id = 0; + assert(q_cur_type(q) == MQE_START); + + if (aux->accept) { + if (sh->flags & SHENG_FLAG_SINGLE_REPORT) { + fireSingleReport(cb, ctxt, sh->report, offset); + } else { + fireReports32(sh, cb, ctxt, s, offset, &cached_state_id, + &cached_report_id, 0); + } + } + + return 0; +} + +char nfaExecSheng32_initCompressedState(const struct NFA *nfa, u64a offset, + void *state, UNUSED u8 key) { + const struct sheng32 *sh = get_sheng32(nfa); + u8 *s = (u8 *)state; + *s = offset ? 
sh->floating: sh->anchored; + return !(*s & SHENG32_STATE_DEAD); +} + +char nfaExecSheng32_queueInitState(const struct NFA *nfa, struct mq *q) { + assert(nfa->scratchStateSize == 1); + + /* starting in floating state */ + const struct sheng32 *sh = get_sheng32(nfa); + *(u8 *)q->state = sh->floating; + DEBUG_PRINTF("starting in floating state\n"); + return 0; +} + +char nfaExecSheng32_queueCompressState(UNUSED const struct NFA *nfa, + const struct mq *q, UNUSED s64a loc) { + void *dest = q->streamState; + const void *src = q->state; + assert(nfa->scratchStateSize == 1); + assert(nfa->streamStateSize == 1); + *(u8 *)dest = *(const u8 *)src; + return 0; +} + +char nfaExecSheng32_expandState(UNUSED const struct NFA *nfa, void *dest, + const void *src, UNUSED u64a offset, + UNUSED u8 key) { + assert(nfa->scratchStateSize == 1); + assert(nfa->streamStateSize == 1); + *(u8 *)dest = *(const u8 *)src; + return 0; +} + +// Sheng64 +static really_inline +char runSheng64Cb(const struct sheng64 *sh, NfaCallback cb, void *ctxt, + u64a offset, u8 *const cached_accept_state, + ReportID *const cached_accept_id, const u8 *cur_buf, + const u8 *start, const u8 *end, u8 can_die, + u8 single, const u8 **scanned, u8 *state) { + DEBUG_PRINTF("Scanning %llu bytes (offset %llu) in callback mode\n", + (u64a)(end - start), offset); + DEBUG_PRINTF("start: %lli end: %lli\n", (s64a)(start - cur_buf), + (s64a)(end - cur_buf)); + DEBUG_PRINTF("can die: %u single: %u\n", !!can_die, !!single); + int rv; + /* scan and report all matches */ + if (can_die) { + rv = sheng64_4_cod(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, + start, end, scanned); + if (rv == MO_HALT_MATCHING) { + return MO_DEAD; + } + rv = sheng64_cod(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, + *scanned, end, scanned); + } else { + rv = sheng64_4_co(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, + start, end, scanned); + if (rv == MO_HALT_MATCHING) { + return MO_DEAD; + } + rv = sheng64_co(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, + *scanned, end, scanned); + } + if (rv == MO_HALT_MATCHING) { + return MO_DEAD; + } + return MO_ALIVE; +} + +static really_inline +void runSheng64Nm(const struct sheng64 *sh, NfaCallback cb, void *ctxt, + u64a offset, u8 *const cached_accept_state, + ReportID *const cached_accept_id, const u8 *cur_buf, + const u8 *start, const u8 *end, u8 can_die, + u8 single, const u8 **scanned, u8 *state) { + DEBUG_PRINTF("Scanning %llu bytes (offset %llu) in nomatch mode\n", + (u64a)(end - start), offset); + DEBUG_PRINTF("start: %lli end: %lli\n", (s64a)(start - cur_buf), + (s64a)(end - cur_buf)); + DEBUG_PRINTF("can die: %u single: %u\n", !!can_die, !!single); + /* just scan the buffer */ + if (can_die) { + sheng64_4_nmd(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, + start, end, scanned); + sheng64_nmd(state, cb, ctxt, sh, cached_accept_state, cached_accept_id, + single, offset, cur_buf, *scanned, end, scanned); + } else { + sheng64_4_nm(state, cb, ctxt, sh, cached_accept_state, cached_accept_id, + single, offset, cur_buf, start, end, scanned); + sheng64_nm(state, cb, ctxt, sh, cached_accept_state, cached_accept_id, + single, offset, cur_buf, *scanned, end, scanned); + } +} + +static really_inline +char runSheng64Sam(const struct sheng64 *sh, NfaCallback cb, void *ctxt, + u64a offset, u8 *const cached_accept_state, + ReportID *const 
cached_accept_id, const u8 *cur_buf, + const u8 *start, const u8 *end, u8 can_die, + u8 single, const u8 **scanned, u8 *state) { + DEBUG_PRINTF("Scanning %llu bytes (offset %llu) in stop at match mode\n", + (u64a)(end - start), offset); + DEBUG_PRINTF("start: %lli end: %lli\n", (s64a)(start - cur_buf), + (s64a)(end - cur_buf)); + DEBUG_PRINTF("can die: %u single: %u\n", !!can_die, !!single); + int rv; + /* scan until first match */ + if (can_die) { + rv = sheng64_4_samd(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, + start, end, scanned); + if (rv == MO_HALT_MATCHING) { + return MO_DEAD; + } + /* if we stopped before we expected, we found a match */ + if (rv == MO_MATCHES_PENDING) { + return MO_MATCHES_PENDING; + } + + rv = sheng64_samd(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, + *scanned, end, scanned); + } else { + rv = sheng64_4_sam(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, + start, end, scanned); + if (rv == MO_HALT_MATCHING) { + return MO_DEAD; + } + /* if we stopped before we expected, we found a match */ + if (rv == MO_MATCHES_PENDING) { + return MO_MATCHES_PENDING; + } + + rv = sheng64_sam(state, cb, ctxt, sh, cached_accept_state, + cached_accept_id, single, offset, cur_buf, + *scanned, end, scanned); + } + if (rv == MO_HALT_MATCHING) { + return MO_DEAD; + } + /* if we stopped before we expected, we found a match */ + if (rv == MO_MATCHES_PENDING) { + return MO_MATCHES_PENDING; + } + return MO_ALIVE; +} + +static never_inline +char runSheng64(const struct sheng64 *sh, struct mq *q, s64a b_end, + enum MatchMode mode) { + u8 state = *(u8 *)q->state; + u8 can_die = sh->flags & SHENG_FLAG_CAN_DIE; + u8 single = sh->flags & SHENG_FLAG_SINGLE_REPORT; + + u8 cached_accept_state = 0; + ReportID cached_accept_id = 0; + + DEBUG_PRINTF("starting Sheng64 execution in state %u\n", + state & SHENG64_STATE_MASK); + + if (q->report_current) { + DEBUG_PRINTF("reporting current pending matches\n"); + assert(sh); + + q->report_current = 0; + + int rv; + if (single) { + rv = fireSingleReport(q->cb, q->context, sh->report, + q_cur_offset(q)); + } else { + rv = fireReports64(sh, q->cb, q->context, state, q_cur_offset(q), + &cached_accept_state, &cached_accept_id, 0); + } + if (rv == MO_HALT_MATCHING) { + DEBUG_PRINTF("exiting in state %u\n", state & SHENG64_STATE_MASK); + return MO_DEAD; + } + + DEBUG_PRINTF("proceeding with matching\n"); + } + + assert(q_cur_type(q) == MQE_START); + s64a start = q_cur_loc(q); + + DEBUG_PRINTF("offset: %lli, location: %lli, mode: %s\n", q->offset, start, + mode == CALLBACK_OUTPUT ? "CALLBACK OUTPUT" : + mode == NO_MATCHES ? "NO MATCHES" : + mode == STOP_AT_MATCH ? "STOP AT MATCH" : "???"); + + DEBUG_PRINTF("processing event @ %lli: %s\n", q->offset + q_cur_loc(q), + q_cur_type(q) == MQE_START ? "START" : + q_cur_type(q) == MQE_TOP ? "TOP" : + q_cur_type(q) == MQE_END ? 
"END" : "???"); + + const u8* cur_buf; + if (start < 0) { + DEBUG_PRINTF("negative location, scanning history\n"); + DEBUG_PRINTF("min location: %zd\n", -q->hlength); + cur_buf = q->history + q->hlength; + } else { + DEBUG_PRINTF("positive location, scanning buffer\n"); + DEBUG_PRINTF("max location: %lli\n", b_end); + cur_buf = q->buffer; + } + + /* if we our queue event is past our end */ + if (mode != NO_MATCHES && q_cur_loc(q) > b_end) { + DEBUG_PRINTF("current location past buffer end\n"); + DEBUG_PRINTF("setting q location to %llu\n", b_end); + DEBUG_PRINTF("exiting in state %u\n", state & SHENG64_STATE_MASK); + q->items[q->cur].location = b_end; + return MO_ALIVE; + } + + q->cur++; + + s64a cur_start = start; + + while (1) { + DEBUG_PRINTF("processing event @ %lli: %s\n", q->offset + q_cur_loc(q), + q_cur_type(q) == MQE_START ? "START" : + q_cur_type(q) == MQE_TOP ? "TOP" : + q_cur_type(q) == MQE_END ? "END" : "???"); + s64a end = q_cur_loc(q); + if (mode != NO_MATCHES) { + end = MIN(end, b_end); + } + assert(end <= (s64a) q->length); + s64a cur_end = end; + + /* we may cross the border between history and current buffer */ + if (cur_start < 0) { + cur_end = MIN(0, cur_end); + } + + DEBUG_PRINTF("start: %lli end: %lli\n", start, end); + + /* don't scan zero length buffer */ + if (cur_start != cur_end) { + const u8 * scanned = cur_buf; + char rv; + + if (mode == NO_MATCHES) { + runSheng64Nm(sh, q->cb, q->context, q->offset, + &cached_accept_state, &cached_accept_id, cur_buf, + cur_buf + cur_start, cur_buf + cur_end, can_die, + single, &scanned, &state); + } else if (mode == CALLBACK_OUTPUT) { + rv = runSheng64Cb(sh, q->cb, q->context, q->offset, + &cached_accept_state, &cached_accept_id, + cur_buf, cur_buf + cur_start, cur_buf + cur_end, + can_die, single, &scanned, &state); + if (rv == MO_DEAD) { + DEBUG_PRINTF("exiting in state %u\n", + state & SHENG64_STATE_MASK); + return MO_DEAD; + } + } else if (mode == STOP_AT_MATCH) { + rv = runSheng64Sam(sh, q->cb, q->context, q->offset, + &cached_accept_state, &cached_accept_id, + cur_buf, cur_buf + cur_start, + cur_buf + cur_end, can_die, single, + &scanned, &state); + if (rv == MO_DEAD) { + DEBUG_PRINTF("exiting in state %u\n", + state & SHENG64_STATE_MASK); + return rv; + } else if (rv == MO_MATCHES_PENDING) { + assert(q->cur); + DEBUG_PRINTF("found a match, setting q location to %zd\n", + scanned - cur_buf + 1); + q->cur--; + q->items[q->cur].type = MQE_START; + q->items[q->cur].location = + scanned - cur_buf + 1; /* due to exiting early */ + *(u8 *)q->state = state; + DEBUG_PRINTF("exiting in state %u\n", + state & SHENG64_STATE_MASK); + return rv; + } + } else { + assert(!"invalid scanning mode!"); + } + assert(scanned == cur_buf + cur_end); + + cur_start = cur_end; + } + + /* if we our queue event is past our end */ + if (mode != NO_MATCHES && q_cur_loc(q) > b_end) { + DEBUG_PRINTF("current location past buffer end\n"); + DEBUG_PRINTF("setting q location to %llu\n", b_end); + DEBUG_PRINTF("exiting in state %u\n", state & SHENG64_STATE_MASK); + q->cur--; + q->items[q->cur].type = MQE_START; + q->items[q->cur].location = b_end; + *(u8 *)q->state = state; + return MO_ALIVE; + } + + /* crossing over into actual buffer */ + if (cur_start == 0) { + DEBUG_PRINTF("positive location, scanning buffer\n"); + DEBUG_PRINTF("max offset: %lli\n", b_end); + cur_buf = q->buffer; + } + + /* continue scanning the same buffer */ + if (end != cur_end) { + continue; + } + + switch (q_cur_type(q)) { + case MQE_END: + *(u8 *)q->state = state; + q->cur++; + 
DEBUG_PRINTF("exiting in state %u\n", state & SHENG64_STATE_MASK); + if (can_die) { + return (state & SHENG64_STATE_DEAD) ? MO_DEAD : MO_ALIVE; + } + return MO_ALIVE; + case MQE_TOP: + if (q->offset + cur_start == 0) { + DEBUG_PRINTF("Anchored start, going to state %u\n", + sh->anchored); + state = sh->anchored; + } else { + u8 new_state = get_aux64(sh, state)->top; + DEBUG_PRINTF("Top event %u->%u\n", state & SHENG64_STATE_MASK, + new_state & SHENG64_STATE_MASK); + state = new_state; + } + break; + default: + assert(!"invalid queue event"); + break; + } + q->cur++; + } +} + +char nfaExecSheng64_B(const struct NFA *n, u64a offset, const u8 *buffer, + size_t length, NfaCallback cb, void *context) { + DEBUG_PRINTF("smallwrite Sheng64\n"); + assert(n->type == SHENG_NFA_64); + const struct sheng64 *sh = getImplNfa(n); + u8 state = sh->anchored; + u8 can_die = sh->flags & SHENG_FLAG_CAN_DIE; + u8 single = sh->flags & SHENG_FLAG_SINGLE_REPORT; + u8 cached_accept_state = 0; + ReportID cached_accept_id = 0; + + /* scan and report all matches */ + int rv; + s64a end = length; + const u8 *scanned; + + rv = runSheng64Cb(sh, cb, context, offset, &cached_accept_state, + &cached_accept_id, buffer, buffer, buffer + end, can_die, + single, &scanned, &state); + if (rv == MO_DEAD) { + DEBUG_PRINTF("exiting in state %u\n", + state & SHENG64_STATE_MASK); + return MO_DEAD; + } + + DEBUG_PRINTF("%u\n", state & SHENG64_STATE_MASK); + + const struct sstate_aux *aux = get_aux64(sh, state); + + if (aux->accept_eod) { + DEBUG_PRINTF("Reporting EOD matches\n"); + fireReports64(sh, cb, context, state, end + offset, + &cached_accept_state, &cached_accept_id, 1); + } + + return state & SHENG64_STATE_DEAD ? MO_DEAD : MO_ALIVE; +} + +char nfaExecSheng64_Q(const struct NFA *n, struct mq *q, s64a end) { + const struct sheng64 *sh = get_sheng64(n); + char rv = runSheng64(sh, q, end, CALLBACK_OUTPUT); + return rv; +} + +char nfaExecSheng64_Q2(const struct NFA *n, struct mq *q, s64a end) { + const struct sheng64 *sh = get_sheng64(n); + char rv = runSheng64(sh, q, end, STOP_AT_MATCH); + return rv; +} + +char nfaExecSheng64_QR(const struct NFA *n, struct mq *q, ReportID report) { + assert(q_cur_type(q) == MQE_START); + + const struct sheng64 *sh = get_sheng64(n); + char rv = runSheng64(sh, q, 0 /* end */, NO_MATCHES); + + if (rv && nfaExecSheng64_inAccept(n, report, q)) { + return MO_MATCHES_PENDING; + } + return rv; +} + +char nfaExecSheng64_inAccept(const struct NFA *n, ReportID report, + struct mq *q) { + assert(n && q); + + const struct sheng64 *sh = get_sheng64(n); + u8 s = *(const u8 *)q->state; + DEBUG_PRINTF("checking accepts for %u\n", (u8)(s & SHENG64_STATE_MASK)); + + const struct sstate_aux *aux = get_aux64(sh, s); + + if (!aux->accept) { + return 0; + } + + return sheng64HasAccept(sh, aux, report); +} + +char nfaExecSheng64_inAnyAccept(const struct NFA *n, struct mq *q) { + assert(n && q); + + const struct sheng64 *sh = get_sheng64(n); + u8 s = *(const u8 *)q->state; + DEBUG_PRINTF("checking accepts for %u\n", (u8)(s & SHENG64_STATE_MASK)); + + const struct sstate_aux *aux = get_aux64(sh, s); + return !!aux->accept; +} + +char nfaExecSheng64_testEOD(const struct NFA *nfa, const char *state, + UNUSED const char *streamState, u64a offset, + NfaCallback cb, void *ctxt) { + assert(nfa); + + const struct sheng64 *sh = get_sheng64(nfa); + u8 s = *(const u8 *)state; + DEBUG_PRINTF("checking EOD accepts for %u\n", (u8)(s & SHENG64_STATE_MASK)); + + const struct sstate_aux *aux = get_aux64(sh, s); + + if (!aux->accept_eod) { 
+ return MO_CONTINUE_MATCHING; + } + + return fireReports64(sh, cb, ctxt, s, offset, NULL, NULL, 1); +} + +char nfaExecSheng64_reportCurrent(const struct NFA *n, struct mq *q) { + const struct sheng64 *sh = (const struct sheng64 *)getImplNfa(n); + NfaCallback cb = q->cb; + void *ctxt = q->context; + u8 s = *(u8 *)q->state; + const struct sstate_aux *aux = get_aux64(sh, s); + u64a offset = q_cur_offset(q); + u8 cached_state_id = 0; + ReportID cached_report_id = 0; + assert(q_cur_type(q) == MQE_START); + + if (aux->accept) { + if (sh->flags & SHENG_FLAG_SINGLE_REPORT) { + fireSingleReport(cb, ctxt, sh->report, offset); + } else { + fireReports64(sh, cb, ctxt, s, offset, &cached_state_id, + &cached_report_id, 0); + } + } + + return 0; +} + +char nfaExecSheng64_initCompressedState(const struct NFA *nfa, u64a offset, + void *state, UNUSED u8 key) { + const struct sheng64 *sh = get_sheng64(nfa); + u8 *s = (u8 *)state; + *s = offset ? sh->floating: sh->anchored; + return !(*s & SHENG64_STATE_DEAD); +} + +char nfaExecSheng64_queueInitState(const struct NFA *nfa, struct mq *q) { + assert(nfa->scratchStateSize == 1); + + /* starting in floating state */ + const struct sheng64 *sh = get_sheng64(nfa); + *(u8 *)q->state = sh->floating; + DEBUG_PRINTF("starting in floating state\n"); + return 0; +} + +char nfaExecSheng64_queueCompressState(UNUSED const struct NFA *nfa, + const struct mq *q, UNUSED s64a loc) { + void *dest = q->streamState; + const void *src = q->state; + assert(nfa->scratchStateSize == 1); + assert(nfa->streamStateSize == 1); + *(u8 *)dest = *(const u8 *)src; + return 0; +} + +char nfaExecSheng64_expandState(UNUSED const struct NFA *nfa, void *dest, + const void *src, UNUSED u64a offset, + UNUSED u8 key) { + assert(nfa->scratchStateSize == 1); + assert(nfa->streamStateSize == 1); + *(u8 *)dest = *(const u8 *)src; + return 0; +} +#endif // end of HAVE_AVX512VBMI diff --git a/regex/nfa/sheng.h b/regex/nfa/sheng.h new file mode 100644 index 000000000..7b90e3034 --- /dev/null +++ b/regex/nfa/sheng.h @@ -0,0 +1,143 @@ +/* + * Copyright (c) 2016-2020, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef SHENG_H_ +#define SHENG_H_ + +#include "callback.h" +#include "ue2common.h" + +struct mq; +struct NFA; + +#define nfaExecSheng_B_Reverse NFA_API_NO_IMPL +#define nfaExecSheng_zombie_status NFA_API_ZOMBIE_NO_IMPL + +char nfaExecSheng_Q(const struct NFA *n, struct mq *q, s64a end); +char nfaExecSheng_Q2(const struct NFA *n, struct mq *q, s64a end); +char nfaExecSheng_QR(const struct NFA *n, struct mq *q, ReportID report); +char nfaExecSheng_inAccept(const struct NFA *n, ReportID report, struct mq *q); +char nfaExecSheng_inAnyAccept(const struct NFA *n, struct mq *q); +char nfaExecSheng_queueInitState(const struct NFA *nfa, struct mq *q); +char nfaExecSheng_queueCompressState(const struct NFA *nfa, const struct mq *q, + s64a loc); +char nfaExecSheng_expandState(const struct NFA *nfa, void *dest, + const void *src, u64a offset, u8 key); +char nfaExecSheng_initCompressedState(const struct NFA *nfa, u64a offset, + void *state, u8 key); +char nfaExecSheng_testEOD(const struct NFA *nfa, const char *state, + const char *streamState, u64a offset, + NfaCallback callback, void *context); +char nfaExecSheng_reportCurrent(const struct NFA *n, struct mq *q); + +char nfaExecSheng_B(const struct NFA *n, u64a offset, const u8 *buffer, + size_t length, NfaCallback cb, void *context); + +#if defined(HAVE_AVX512VBMI) +#define nfaExecSheng32_B_Reverse NFA_API_NO_IMPL +#define nfaExecSheng32_zombie_status NFA_API_ZOMBIE_NO_IMPL + +char nfaExecSheng32_Q(const struct NFA *n, struct mq *q, s64a end); +char nfaExecSheng32_Q2(const struct NFA *n, struct mq *q, s64a end); +char nfaExecSheng32_QR(const struct NFA *n, struct mq *q, ReportID report); +char nfaExecSheng32_inAccept(const struct NFA *n, ReportID report, + struct mq *q); +char nfaExecSheng32_inAnyAccept(const struct NFA *n, struct mq *q); +char nfaExecSheng32_queueInitState(const struct NFA *nfa, struct mq *q); +char nfaExecSheng32_queueCompressState(const struct NFA *nfa, + const struct mq *q, s64a loc); +char nfaExecSheng32_expandState(const struct NFA *nfa, void *dest, + const void *src, u64a offset, u8 key); +char nfaExecSheng32_initCompressedState(const struct NFA *nfa, u64a offset, + void *state, u8 key); +char nfaExecSheng32_testEOD(const struct NFA *nfa, const char *state, + const char *streamState, u64a offset, + NfaCallback callback, void *context); +char nfaExecSheng32_reportCurrent(const struct NFA *n, struct mq *q); + +char nfaExecSheng32_B(const struct NFA *n, u64a offset, const u8 *buffer, + size_t length, NfaCallback cb, void *context); + +#define nfaExecSheng64_B_Reverse NFA_API_NO_IMPL +#define nfaExecSheng64_zombie_status NFA_API_ZOMBIE_NO_IMPL + +char nfaExecSheng64_Q(const struct NFA *n, struct mq *q, s64a end); +char nfaExecSheng64_Q2(const struct NFA *n, struct mq *q, s64a end); +char nfaExecSheng64_QR(const struct NFA *n, struct mq *q, ReportID report); +char nfaExecSheng64_inAccept(const struct NFA *n, ReportID report, + struct mq *q); +char nfaExecSheng64_inAnyAccept(const struct NFA 
*n, struct mq *q); +char nfaExecSheng64_queueInitState(const struct NFA *nfa, struct mq *q); +char nfaExecSheng64_queueCompressState(const struct NFA *nfa, + const struct mq *q, s64a loc); +char nfaExecSheng64_expandState(const struct NFA *nfa, void *dest, + const void *src, u64a offset, u8 key); +char nfaExecSheng64_initCompressedState(const struct NFA *nfa, u64a offset, + void *state, u8 key); +char nfaExecSheng64_testEOD(const struct NFA *nfa, const char *state, + const char *streamState, u64a offset, + NfaCallback callback, void *context); +char nfaExecSheng64_reportCurrent(const struct NFA *n, struct mq *q); + +char nfaExecSheng64_B(const struct NFA *n, u64a offset, const u8 *buffer, + size_t length, NfaCallback cb, void *context); + +#else // !HAVE_AVX512VBMI + +#define nfaExecSheng32_B_Reverse NFA_API_NO_IMPL +#define nfaExecSheng32_zombie_status NFA_API_ZOMBIE_NO_IMPL +#define nfaExecSheng32_Q NFA_API_NO_IMPL +#define nfaExecSheng32_Q2 NFA_API_NO_IMPL +#define nfaExecSheng32_QR NFA_API_NO_IMPL +#define nfaExecSheng32_inAccept NFA_API_NO_IMPL +#define nfaExecSheng32_inAnyAccept NFA_API_NO_IMPL +#define nfaExecSheng32_queueInitState NFA_API_NO_IMPL +#define nfaExecSheng32_queueCompressState NFA_API_NO_IMPL +#define nfaExecSheng32_expandState NFA_API_NO_IMPL +#define nfaExecSheng32_initCompressedState NFA_API_NO_IMPL +#define nfaExecSheng32_testEOD NFA_API_NO_IMPL +#define nfaExecSheng32_reportCurrent NFA_API_NO_IMPL +#define nfaExecSheng32_B NFA_API_NO_IMPL + +#define nfaExecSheng64_B_Reverse NFA_API_NO_IMPL +#define nfaExecSheng64_zombie_status NFA_API_ZOMBIE_NO_IMPL +#define nfaExecSheng64_Q NFA_API_NO_IMPL +#define nfaExecSheng64_Q2 NFA_API_NO_IMPL +#define nfaExecSheng64_QR NFA_API_NO_IMPL +#define nfaExecSheng64_inAccept NFA_API_NO_IMPL +#define nfaExecSheng64_inAnyAccept NFA_API_NO_IMPL +#define nfaExecSheng64_queueInitState NFA_API_NO_IMPL +#define nfaExecSheng64_queueCompressState NFA_API_NO_IMPL +#define nfaExecSheng64_expandState NFA_API_NO_IMPL +#define nfaExecSheng64_initCompressedState NFA_API_NO_IMPL +#define nfaExecSheng64_testEOD NFA_API_NO_IMPL +#define nfaExecSheng64_reportCurrent NFA_API_NO_IMPL +#define nfaExecSheng64_B NFA_API_NO_IMPL +#endif // end of HAVE_AVX512VBMI + +#endif /* SHENG_H_ */ diff --git a/regex/nfa/sheng_defs.h b/regex/nfa/sheng_defs.h new file mode 100644 index 000000000..390af7522 --- /dev/null +++ b/regex/nfa/sheng_defs.h @@ -0,0 +1,754 @@ +/* + * Copyright (c) 2016-2020, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef SHENG_DEFS_H +#define SHENG_DEFS_H + +/* + * Utility functions used by various versions of Sheng engine + */ +static really_inline +u8 isDeadState(const u8 a) { + return a & SHENG_STATE_DEAD; +} + +static really_inline +u8 isAcceptState(const u8 a) { + return a & SHENG_STATE_ACCEPT; +} + +static really_inline +u8 isAccelState(const u8 a) { + return a & SHENG_STATE_ACCEL; +} + +static really_inline +u8 hasInterestingStates(const u8 a, const u8 b, const u8 c, const u8 d) { + return (a | b | c | d) & (SHENG_STATE_FLAG_MASK); +} + +#if defined(HAVE_AVX512VBMI) +static really_inline +u8 isDeadState32(const u8 a) { + return a & SHENG32_STATE_DEAD; +} + +static really_inline +u8 isAcceptState32(const u8 a) { + return a & SHENG32_STATE_ACCEPT; +} + +static really_inline +u8 isAccelState32(const u8 a) { + return a & SHENG32_STATE_ACCEL; +} + +static really_inline +u8 hasInterestingStates32(const u8 a, const u8 b, const u8 c, const u8 d) { + return (a | b | c | d) & (SHENG32_STATE_FLAG_MASK); +} + +static really_inline +u8 isDeadState64(const u8 a) { + return a & SHENG64_STATE_DEAD; +} + +static really_inline +u8 isAcceptState64(const u8 a) { + return a & SHENG64_STATE_ACCEPT; +} + +static really_inline +u8 hasInterestingStates64(const u8 a, const u8 b, const u8 c, const u8 d) { + return (a | b | c | d) & (SHENG64_STATE_FLAG_MASK); +} +#endif + +/* these functions should be optimized out, used by NO_MATCHES mode */ +static really_inline +u8 dummyFunc4(UNUSED const u8 a, UNUSED const u8 b, UNUSED const u8 c, + UNUSED const u8 d) { + return 0; +} + +static really_inline +u8 dummyFunc(UNUSED const u8 a) { + return 0; +} + +/* + * Sheng function definitions for single byte loops + */ +/* callback output, can die */ +#define SHENG_IMPL sheng_cod +#define DEAD_FUNC isDeadState +#define ACCEPT_FUNC isAcceptState +#if defined(HAVE_AVX512VBMI) +#define SHENG32_IMPL sheng32_cod +#define DEAD_FUNC32 isDeadState32 +#define ACCEPT_FUNC32 isAcceptState32 +#define SHENG64_IMPL sheng64_cod +#define DEAD_FUNC64 isDeadState64 +#define ACCEPT_FUNC64 isAcceptState64 +#endif +#define STOP_AT_MATCH 0 +#include "sheng_impl.h" +#undef SHENG_IMPL +#undef DEAD_FUNC +#undef ACCEPT_FUNC +#if defined(HAVE_AVX512VBMI) +#undef SHENG32_IMPL +#undef DEAD_FUNC32 +#undef ACCEPT_FUNC32 +#undef SHENG64_IMPL +#undef DEAD_FUNC64 +#undef ACCEPT_FUNC64 +#endif +#undef STOP_AT_MATCH + +/* callback output, can't die */ +#define SHENG_IMPL sheng_co +#define DEAD_FUNC dummyFunc +#define ACCEPT_FUNC isAcceptState +#if defined(HAVE_AVX512VBMI) +#define SHENG32_IMPL sheng32_co +#define DEAD_FUNC32 dummyFunc +#define ACCEPT_FUNC32 isAcceptState32 +#define SHENG64_IMPL sheng64_co +#define DEAD_FUNC64 dummyFunc +#define ACCEPT_FUNC64 isAcceptState64 +#endif +#define STOP_AT_MATCH 0 +#include "sheng_impl.h" +#undef SHENG_IMPL +#undef DEAD_FUNC +#undef ACCEPT_FUNC +#if defined(HAVE_AVX512VBMI) +#undef SHENG32_IMPL +#undef DEAD_FUNC32 +#undef ACCEPT_FUNC32 +#undef SHENG64_IMPL +#undef DEAD_FUNC64 +#undef 
ACCEPT_FUNC64 +#endif +#undef STOP_AT_MATCH + +/* stop at match, can die */ +#define SHENG_IMPL sheng_samd +#define DEAD_FUNC isDeadState +#define ACCEPT_FUNC isAcceptState +#if defined(HAVE_AVX512VBMI) +#define SHENG32_IMPL sheng32_samd +#define DEAD_FUNC32 isDeadState32 +#define ACCEPT_FUNC32 isAcceptState32 +#define SHENG64_IMPL sheng64_samd +#define DEAD_FUNC64 isDeadState64 +#define ACCEPT_FUNC64 isAcceptState64 +#endif +#define STOP_AT_MATCH 1 +#include "sheng_impl.h" +#undef SHENG_IMPL +#undef DEAD_FUNC +#undef ACCEPT_FUNC +#if defined(HAVE_AVX512VBMI) +#undef SHENG32_IMPL +#undef DEAD_FUNC32 +#undef ACCEPT_FUNC32 +#undef SHENG64_IMPL +#undef DEAD_FUNC64 +#undef ACCEPT_FUNC64 +#endif +#undef STOP_AT_MATCH + +/* stop at match, can't die */ +#define SHENG_IMPL sheng_sam +#define DEAD_FUNC dummyFunc +#define ACCEPT_FUNC isAcceptState +#if defined(HAVE_AVX512VBMI) +#define SHENG32_IMPL sheng32_sam +#define DEAD_FUNC32 dummyFunc +#define ACCEPT_FUNC32 isAcceptState32 +#define SHENG64_IMPL sheng64_sam +#define DEAD_FUNC64 dummyFunc +#define ACCEPT_FUNC64 isAcceptState64 +#endif +#define STOP_AT_MATCH 1 +#include "sheng_impl.h" +#undef SHENG_IMPL +#undef DEAD_FUNC +#undef ACCEPT_FUNC +#if defined(HAVE_AVX512VBMI) +#undef SHENG32_IMPL +#undef DEAD_FUNC32 +#undef ACCEPT_FUNC32 +#undef SHENG64_IMPL +#undef DEAD_FUNC64 +#undef ACCEPT_FUNC64 +#endif +#undef STOP_AT_MATCH + +/* no match, can die */ +#define SHENG_IMPL sheng_nmd +#define DEAD_FUNC isDeadState +#define ACCEPT_FUNC dummyFunc +#if defined(HAVE_AVX512VBMI) +#define SHENG32_IMPL sheng32_nmd +#define DEAD_FUNC32 isDeadState32 +#define ACCEPT_FUNC32 dummyFunc +#define SHENG64_IMPL sheng64_nmd +#define DEAD_FUNC64 isDeadState64 +#define ACCEPT_FUNC64 dummyFunc +#endif +#define STOP_AT_MATCH 0 +#include "sheng_impl.h" +#undef SHENG_IMPL +#undef DEAD_FUNC +#undef ACCEPT_FUNC +#if defined(HAVE_AVX512VBMI) +#undef SHENG32_IMPL +#undef DEAD_FUNC32 +#undef ACCEPT_FUNC32 +#undef SHENG64_IMPL +#undef DEAD_FUNC64 +#undef ACCEPT_FUNC64 +#endif +#undef STOP_AT_MATCH + +/* no match, can't die */ +#define SHENG_IMPL sheng_nm +#define DEAD_FUNC dummyFunc +#define ACCEPT_FUNC dummyFunc +#if defined(HAVE_AVX512VBMI) +#define SHENG32_IMPL sheng32_nm +#define DEAD_FUNC32 dummyFunc +#define ACCEPT_FUNC32 dummyFunc +#define SHENG64_IMPL sheng64_nm +#define DEAD_FUNC64 dummyFunc +#define ACCEPT_FUNC64 dummyFunc +#endif +#define STOP_AT_MATCH 0 +#include "sheng_impl.h" +#undef SHENG_IMPL +#undef DEAD_FUNC +#undef ACCEPT_FUNC +#if defined(HAVE_AVX512VBMI) +#undef SHENG32_IMPL +#undef DEAD_FUNC32 +#undef ACCEPT_FUNC32 +#undef SHENG64_IMPL +#undef DEAD_FUNC64 +#undef ACCEPT_FUNC64 +#endif +#undef STOP_AT_MATCH + +/* + * Sheng function definitions for 4-byte loops + */ +/* callback output, can die, accelerated */ +#define SHENG_IMPL sheng4_coda +#define INTERESTING_FUNC hasInterestingStates +#define INNER_DEAD_FUNC isDeadState +#define OUTER_DEAD_FUNC dummyFunc +#define INNER_ACCEL_FUNC isAccelState +#define OUTER_ACCEL_FUNC dummyFunc +#define ACCEPT_FUNC isAcceptState +#if defined(HAVE_AVX512VBMI) +#define SHENG32_IMPL sheng32_4_coda +#define INTERESTING_FUNC32 hasInterestingStates32 +#define INNER_DEAD_FUNC32 isDeadState32 +#define OUTER_DEAD_FUNC32 dummyFunc +#define INNER_ACCEL_FUNC32 isAccelState32 +#define OUTER_ACCEL_FUNC32 dummyFunc +#define ACCEPT_FUNC32 isAcceptState32 +#define NO_SHENG64_IMPL +#endif +#define STOP_AT_MATCH 0 +#include "sheng_impl4.h" +#undef SHENG_IMPL +#undef INTERESTING_FUNC +#undef INNER_DEAD_FUNC +#undef OUTER_DEAD_FUNC +#undef 
INNER_ACCEL_FUNC +#undef OUTER_ACCEL_FUNC +#undef ACCEPT_FUNC +#if defined(HAVE_AVX512VBMI) +#undef SHENG32_IMPL +#undef INTERESTING_FUNC32 +#undef INNER_DEAD_FUNC32 +#undef OUTER_DEAD_FUNC32 +#undef INNER_ACCEL_FUNC32 +#undef OUTER_ACCEL_FUNC32 +#undef ACCEPT_FUNC32 +#undef NO_SHENG64_IMPL +#endif +#undef STOP_AT_MATCH + +/* callback output, can die, not accelerated */ +#define SHENG_IMPL sheng4_cod +#define INTERESTING_FUNC hasInterestingStates +#define INNER_DEAD_FUNC isDeadState +#define OUTER_DEAD_FUNC dummyFunc +#define INNER_ACCEL_FUNC dummyFunc +#define OUTER_ACCEL_FUNC dummyFunc +#define ACCEPT_FUNC isAcceptState +#if defined(HAVE_AVX512VBMI) +#define SHENG32_IMPL sheng32_4_cod +#define INTERESTING_FUNC32 hasInterestingStates32 +#define INNER_DEAD_FUNC32 isDeadState32 +#define OUTER_DEAD_FUNC32 dummyFunc +#define INNER_ACCEL_FUNC32 dummyFunc +#define OUTER_ACCEL_FUNC32 dummyFunc +#define ACCEPT_FUNC32 isAcceptState32 +#define SHENG64_IMPL sheng64_4_cod +#define INTERESTING_FUNC64 hasInterestingStates64 +#define INNER_DEAD_FUNC64 isDeadState64 +#define OUTER_DEAD_FUNC64 dummyFunc +#define ACCEPT_FUNC64 isAcceptState64 +#endif +#define STOP_AT_MATCH 0 +#include "sheng_impl4.h" +#undef SHENG_IMPL +#undef INTERESTING_FUNC +#undef INNER_DEAD_FUNC +#undef OUTER_DEAD_FUNC +#undef INNER_ACCEL_FUNC +#undef OUTER_ACCEL_FUNC +#undef ACCEPT_FUNC +#if defined(HAVE_AVX512VBMI) +#undef SHENG32_IMPL +#undef INTERESTING_FUNC32 +#undef INNER_DEAD_FUNC32 +#undef OUTER_DEAD_FUNC32 +#undef INNER_ACCEL_FUNC32 +#undef OUTER_ACCEL_FUNC32 +#undef ACCEPT_FUNC32 +#undef SHENG64_IMPL +#undef INTERESTING_FUNC64 +#undef INNER_DEAD_FUNC64 +#undef OUTER_DEAD_FUNC64 +#undef ACCEPT_FUNC64 +#endif +#undef STOP_AT_MATCH + +/* callback output, can't die, accelerated */ +#define SHENG_IMPL sheng4_coa +#define INTERESTING_FUNC hasInterestingStates +#define INNER_DEAD_FUNC dummyFunc +#define OUTER_DEAD_FUNC dummyFunc +#define INNER_ACCEL_FUNC isAccelState +#define OUTER_ACCEL_FUNC dummyFunc +#define ACCEPT_FUNC isAcceptState +#if defined(HAVE_AVX512VBMI) +#define SHENG32_IMPL sheng32_4_coa +#define INTERESTING_FUNC32 hasInterestingStates32 +#define INNER_DEAD_FUNC32 dummyFunc +#define OUTER_DEAD_FUNC32 dummyFunc +#define INNER_ACCEL_FUNC32 isAccelState32 +#define OUTER_ACCEL_FUNC32 dummyFunc +#define ACCEPT_FUNC32 isAcceptState32 +#define NO_SHENG64_IMPL +#endif +#define STOP_AT_MATCH 0 +#include "sheng_impl4.h" +#undef SHENG_IMPL +#undef INTERESTING_FUNC +#undef INNER_DEAD_FUNC +#undef OUTER_DEAD_FUNC +#undef INNER_ACCEL_FUNC +#undef OUTER_ACCEL_FUNC +#undef ACCEPT_FUNC +#if defined(HAVE_AVX512VBMI) +#undef SHENG32_IMPL +#undef INTERESTING_FUNC32 +#undef INNER_DEAD_FUNC32 +#undef OUTER_DEAD_FUNC32 +#undef INNER_ACCEL_FUNC32 +#undef OUTER_ACCEL_FUNC32 +#undef ACCEPT_FUNC32 +#undef NO_SHENG64_IMPL +#endif +#undef STOP_AT_MATCH + +/* callback output, can't die, not accelerated */ +#define SHENG_IMPL sheng4_co +#define INTERESTING_FUNC hasInterestingStates +#define INNER_DEAD_FUNC dummyFunc +#define OUTER_DEAD_FUNC dummyFunc +#define INNER_ACCEL_FUNC dummyFunc +#define OUTER_ACCEL_FUNC dummyFunc +#define ACCEPT_FUNC isAcceptState +#if defined(HAVE_AVX512VBMI) +#define SHENG32_IMPL sheng32_4_co +#define INTERESTING_FUNC32 hasInterestingStates32 +#define INNER_DEAD_FUNC32 dummyFunc +#define OUTER_DEAD_FUNC32 dummyFunc +#define INNER_ACCEL_FUNC32 dummyFunc +#define OUTER_ACCEL_FUNC32 dummyFunc +#define ACCEPT_FUNC32 isAcceptState32 +#define SHENG64_IMPL sheng64_4_co +#define INTERESTING_FUNC64 hasInterestingStates64 +#define 
INNER_DEAD_FUNC64 dummyFunc +#define OUTER_DEAD_FUNC64 dummyFunc +#define ACCEPT_FUNC64 isAcceptState64 +#endif +#define STOP_AT_MATCH 0 +#include "sheng_impl4.h" +#undef SHENG_IMPL +#undef INTERESTING_FUNC +#undef INNER_DEAD_FUNC +#undef OUTER_DEAD_FUNC +#undef INNER_ACCEL_FUNC +#undef OUTER_ACCEL_FUNC +#undef ACCEPT_FUNC +#if defined(HAVE_AVX512VBMI) +#undef SHENG32_IMPL +#undef INTERESTING_FUNC32 +#undef INNER_DEAD_FUNC32 +#undef OUTER_DEAD_FUNC32 +#undef INNER_ACCEL_FUNC32 +#undef OUTER_ACCEL_FUNC32 +#undef ACCEPT_FUNC32 +#undef SHENG64_IMPL +#undef INTERESTING_FUNC64 +#undef INNER_DEAD_FUNC64 +#undef OUTER_DEAD_FUNC64 +#undef ACCEPT_FUNC64 +#endif +#undef STOP_AT_MATCH + +/* stop at match, can die, accelerated */ +#define SHENG_IMPL sheng4_samda +#define INTERESTING_FUNC hasInterestingStates +#define INNER_DEAD_FUNC isDeadState +#define OUTER_DEAD_FUNC dummyFunc +#define INNER_ACCEL_FUNC isAccelState +#define OUTER_ACCEL_FUNC dummyFunc +#define ACCEPT_FUNC isAcceptState +#if defined(HAVE_AVX512VBMI) +#define SHENG32_IMPL sheng32_4_samda +#define INTERESTING_FUNC32 hasInterestingStates32 +#define INNER_DEAD_FUNC32 isDeadState32 +#define OUTER_DEAD_FUNC32 dummyFunc +#define INNER_ACCEL_FUNC32 isAccelState32 +#define OUTER_ACCEL_FUNC32 dummyFunc +#define ACCEPT_FUNC32 isAcceptState32 +#define NO_SHENG64_IMPL +#endif +#define STOP_AT_MATCH 1 +#include "sheng_impl4.h" +#undef SHENG_IMPL +#undef INTERESTING_FUNC +#undef INNER_DEAD_FUNC +#undef OUTER_DEAD_FUNC +#undef INNER_ACCEL_FUNC +#undef OUTER_ACCEL_FUNC +#undef ACCEPT_FUNC +#if defined(HAVE_AVX512VBMI) +#undef SHENG32_IMPL +#undef INTERESTING_FUNC32 +#undef INNER_DEAD_FUNC32 +#undef OUTER_DEAD_FUNC32 +#undef INNER_ACCEL_FUNC32 +#undef OUTER_ACCEL_FUNC32 +#undef ACCEPT_FUNC32 +#undef NO_SHENG64_IMPL +#endif +#undef STOP_AT_MATCH + +/* stop at match, can die, not accelerated */ +#define SHENG_IMPL sheng4_samd +#define INTERESTING_FUNC hasInterestingStates +#define INNER_DEAD_FUNC isDeadState +#define OUTER_DEAD_FUNC dummyFunc +#define INNER_ACCEL_FUNC dummyFunc +#define OUTER_ACCEL_FUNC dummyFunc +#define ACCEPT_FUNC isAcceptState +#if defined(HAVE_AVX512VBMI) +#define SHENG32_IMPL sheng32_4_samd +#define INTERESTING_FUNC32 hasInterestingStates32 +#define INNER_DEAD_FUNC32 isDeadState32 +#define OUTER_DEAD_FUNC32 dummyFunc +#define INNER_ACCEL_FUNC32 dummyFunc +#define OUTER_ACCEL_FUNC32 dummyFunc +#define ACCEPT_FUNC32 isAcceptState32 +#define SHENG64_IMPL sheng64_4_samd +#define INTERESTING_FUNC64 hasInterestingStates64 +#define INNER_DEAD_FUNC64 isDeadState64 +#define OUTER_DEAD_FUNC64 dummyFunc +#define ACCEPT_FUNC64 isAcceptState64 +#endif +#define STOP_AT_MATCH 1 +#include "sheng_impl4.h" +#undef SHENG_IMPL +#undef INTERESTING_FUNC +#undef INNER_DEAD_FUNC +#undef OUTER_DEAD_FUNC +#undef INNER_ACCEL_FUNC +#undef OUTER_ACCEL_FUNC +#undef ACCEPT_FUNC +#if defined(HAVE_AVX512VBMI) +#undef SHENG32_IMPL +#undef INTERESTING_FUNC32 +#undef INNER_DEAD_FUNC32 +#undef OUTER_DEAD_FUNC32 +#undef INNER_ACCEL_FUNC32 +#undef OUTER_ACCEL_FUNC32 +#undef ACCEPT_FUNC32 +#undef SHENG64_IMPL +#undef INTERESTING_FUNC64 +#undef INNER_DEAD_FUNC64 +#undef OUTER_DEAD_FUNC64 +#undef ACCEPT_FUNC64 +#endif +#undef STOP_AT_MATCH + +/* stop at match, can't die, accelerated */ +#define SHENG_IMPL sheng4_sama +#define INTERESTING_FUNC hasInterestingStates +#define INNER_DEAD_FUNC dummyFunc +#define OUTER_DEAD_FUNC dummyFunc +#define INNER_ACCEL_FUNC isAccelState +#define OUTER_ACCEL_FUNC dummyFunc +#define ACCEPT_FUNC isAcceptState +#if defined(HAVE_AVX512VBMI) 
+#define SHENG32_IMPL sheng32_4_sama +#define INTERESTING_FUNC32 hasInterestingStates32 +#define INNER_DEAD_FUNC32 dummyFunc +#define OUTER_DEAD_FUNC32 dummyFunc +#define INNER_ACCEL_FUNC32 isAccelState32 +#define OUTER_ACCEL_FUNC32 dummyFunc +#define ACCEPT_FUNC32 isAcceptState32 +#define NO_SHENG64_IMPL +#endif +#define STOP_AT_MATCH 1 +#include "sheng_impl4.h" +#undef SHENG_IMPL +#undef INTERESTING_FUNC +#undef INNER_DEAD_FUNC +#undef OUTER_DEAD_FUNC +#undef INNER_ACCEL_FUNC +#undef OUTER_ACCEL_FUNC +#undef ACCEPT_FUNC +#if defined(HAVE_AVX512VBMI) +#undef SHENG32_IMPL +#undef INTERESTING_FUNC32 +#undef INNER_DEAD_FUNC32 +#undef OUTER_DEAD_FUNC32 +#undef INNER_ACCEL_FUNC32 +#undef OUTER_ACCEL_FUNC32 +#undef ACCEPT_FUNC32 +#undef NO_SHENG64_IMPL +#endif +#undef STOP_AT_MATCH + +/* stop at match, can't die, not accelerated */ +#define SHENG_IMPL sheng4_sam +#define INTERESTING_FUNC hasInterestingStates +#define INNER_DEAD_FUNC dummyFunc +#define OUTER_DEAD_FUNC dummyFunc +#define INNER_ACCEL_FUNC dummyFunc +#define OUTER_ACCEL_FUNC dummyFunc +#define ACCEPT_FUNC isAcceptState +#if defined(HAVE_AVX512VBMI) +#define SHENG32_IMPL sheng32_4_sam +#define INTERESTING_FUNC32 hasInterestingStates32 +#define INNER_DEAD_FUNC32 dummyFunc +#define OUTER_DEAD_FUNC32 dummyFunc +#define INNER_ACCEL_FUNC32 dummyFunc +#define OUTER_ACCEL_FUNC32 dummyFunc +#define ACCEPT_FUNC32 isAcceptState32 +#define SHENG64_IMPL sheng64_4_sam +#define INTERESTING_FUNC64 hasInterestingStates64 +#define INNER_DEAD_FUNC64 dummyFunc +#define OUTER_DEAD_FUNC64 dummyFunc +#define ACCEPT_FUNC64 isAcceptState64 +#endif +#define STOP_AT_MATCH 1 +#include "sheng_impl4.h" +#undef SHENG_IMPL +#undef INTERESTING_FUNC +#undef INNER_DEAD_FUNC +#undef OUTER_DEAD_FUNC +#undef INNER_ACCEL_FUNC +#undef OUTER_ACCEL_FUNC +#undef ACCEPT_FUNC +#if defined(HAVE_AVX512VBMI) +#undef SHENG32_IMPL +#undef INTERESTING_FUNC32 +#undef INNER_DEAD_FUNC32 +#undef OUTER_DEAD_FUNC32 +#undef INNER_ACCEL_FUNC32 +#undef OUTER_ACCEL_FUNC32 +#undef ACCEPT_FUNC32 +#undef SHENG64_IMPL +#undef INTERESTING_FUNC64 +#undef INNER_DEAD_FUNC64 +#undef OUTER_DEAD_FUNC64 +#undef ACCEPT_FUNC64 +#endif +#undef STOP_AT_MATCH + +/* no-match have interesting func as dummy, and die/accel checks are outer */ + +/* no match, can die, accelerated */ +#define SHENG_IMPL sheng4_nmda +#define INTERESTING_FUNC dummyFunc4 +#define INNER_DEAD_FUNC dummyFunc +#define OUTER_DEAD_FUNC isDeadState +#define INNER_ACCEL_FUNC dummyFunc +#define OUTER_ACCEL_FUNC isAccelState +#define ACCEPT_FUNC dummyFunc +#if defined(HAVE_AVX512VBMI) +#define SHENG32_IMPL sheng32_4_nmda +#define INTERESTING_FUNC32 dummyFunc4 +#define INNER_DEAD_FUNC32 dummyFunc +#define OUTER_DEAD_FUNC32 isDeadState32 +#define INNER_ACCEL_FUNC32 dummyFunc +#define OUTER_ACCEL_FUNC32 isAccelState32 +#define ACCEPT_FUNC32 dummyFunc +#define NO_SHENG64_IMPL +#endif +#define STOP_AT_MATCH 0 +#include "sheng_impl4.h" +#undef SHENG_IMPL +#undef INTERESTING_FUNC +#undef INNER_DEAD_FUNC +#undef OUTER_DEAD_FUNC +#undef INNER_ACCEL_FUNC +#undef OUTER_ACCEL_FUNC +#undef ACCEPT_FUNC +#if defined(HAVE_AVX512VBMI) +#undef SHENG32_IMPL +#undef INTERESTING_FUNC32 +#undef INNER_DEAD_FUNC32 +#undef OUTER_DEAD_FUNC32 +#undef INNER_ACCEL_FUNC32 +#undef OUTER_ACCEL_FUNC32 +#undef ACCEPT_FUNC32 +#undef NO_SHENG64_IMPL +#endif +#undef STOP_AT_MATCH + +/* no match, can die, not accelerated */ +#define SHENG_IMPL sheng4_nmd +#define INTERESTING_FUNC dummyFunc4 +#define INNER_DEAD_FUNC dummyFunc +#define OUTER_DEAD_FUNC isDeadState +#define 
INNER_ACCEL_FUNC dummyFunc +#define OUTER_ACCEL_FUNC dummyFunc +#define ACCEPT_FUNC dummyFunc +#if defined(HAVE_AVX512VBMI) +#define SHENG32_IMPL sheng32_4_nmd +#define INTERESTING_FUNC32 dummyFunc4 +#define INNER_DEAD_FUNC32 dummyFunc +#define OUTER_DEAD_FUNC32 isDeadState32 +#define INNER_ACCEL_FUNC32 dummyFunc +#define OUTER_ACCEL_FUNC32 dummyFunc +#define ACCEPT_FUNC32 dummyFunc +#define SHENG64_IMPL sheng64_4_nmd +#define INTERESTING_FUNC64 dummyFunc4 +#define INNER_DEAD_FUNC64 dummyFunc +#define OUTER_DEAD_FUNC64 isDeadState64 +#define ACCEPT_FUNC64 dummyFunc +#endif +#define STOP_AT_MATCH 0 +#include "sheng_impl4.h" +#undef SHENG_IMPL +#undef INTERESTING_FUNC +#undef INNER_DEAD_FUNC +#undef OUTER_DEAD_FUNC +#undef INNER_ACCEL_FUNC +#undef OUTER_ACCEL_FUNC +#undef ACCEPT_FUNC +#if defined(HAVE_AVX512VBMI) +#undef SHENG32_IMPL +#undef INTERESTING_FUNC32 +#undef INNER_DEAD_FUNC32 +#undef OUTER_DEAD_FUNC32 +#undef INNER_ACCEL_FUNC32 +#undef OUTER_ACCEL_FUNC32 +#undef ACCEPT_FUNC32 +#undef SHENG64_IMPL +#undef INTERESTING_FUNC64 +#undef INNER_DEAD_FUNC64 +#undef OUTER_DEAD_FUNC64 +#undef ACCEPT_FUNC64 +#endif +#undef STOP_AT_MATCH + +/* there is no performance benefit in accelerating a no-match case that can't + * die */ + +/* no match, can't die */ +#define SHENG_IMPL sheng4_nm +#define INTERESTING_FUNC dummyFunc4 +#define INNER_DEAD_FUNC dummyFunc +#define OUTER_DEAD_FUNC dummyFunc +#define INNER_ACCEL_FUNC dummyFunc +#define OUTER_ACCEL_FUNC dummyFunc +#define ACCEPT_FUNC dummyFunc +#if defined(HAVE_AVX512VBMI) +#define SHENG32_IMPL sheng32_4_nm +#define INTERESTING_FUNC32 dummyFunc4 +#define INNER_DEAD_FUNC32 dummyFunc +#define OUTER_DEAD_FUNC32 dummyFunc +#define INNER_ACCEL_FUNC32 dummyFunc +#define OUTER_ACCEL_FUNC32 dummyFunc +#define ACCEPT_FUNC32 dummyFunc +#define SHENG64_IMPL sheng64_4_nm +#define INTERESTING_FUNC64 dummyFunc4 +#define INNER_DEAD_FUNC64 dummyFunc +#define OUTER_DEAD_FUNC64 dummyFunc +#define ACCEPT_FUNC64 dummyFunc +#endif +#define STOP_AT_MATCH 0 +#include "sheng_impl4.h" +#undef SHENG_IMPL +#undef INTERESTING_FUNC +#undef INNER_DEAD_FUNC +#undef OUTER_DEAD_FUNC +#undef INNER_ACCEL_FUNC +#undef OUTER_ACCEL_FUNC +#undef ACCEPT_FUNC +#if defined(HAVE_AVX512VBMI) +#undef SHENG32_IMPL +#undef INTERESTING_FUNC32 +#undef INNER_DEAD_FUNC32 +#undef OUTER_DEAD_FUNC32 +#undef INNER_ACCEL_FUNC32 +#undef OUTER_ACCEL_FUNC32 +#undef ACCEPT_FUNC32 +#undef SHENG64_IMPL +#undef INTERESTING_FUNC64 +#undef INNER_DEAD_FUNC64 +#undef OUTER_DEAD_FUNC64 +#undef ACCEPT_FUNC64 +#endif +#undef STOP_AT_MATCH + +#endif // SHENG_DEFS_H diff --git a/regex/nfa/sheng_impl.h b/regex/nfa/sheng_impl.h new file mode 100644 index 000000000..fb8ee1683 --- /dev/null +++ b/regex/nfa/sheng_impl.h @@ -0,0 +1,221 @@ +/* + * Copyright (c) 2016-2020, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * In order to use this macro, the following things need to be defined: + * + * - SHENG_IMPL (name of the Sheng implementation function) + * - DEAD_FUNC (name of the function checking for dead states) + * - ACCEPT_FUNC (name of the function checking for accept state) + * - STOP_AT_MATCH (can be 1 or 0, enable or disable stop at match) + */ + +/* byte-by-byte version. we don't do byte-by-byte death checking as it's + * pretty pointless to do it over a buffer that's at most 3 bytes long */ +static really_inline +char SHENG_IMPL(u8 *state, NfaCallback cb, void *ctxt, const struct sheng *s, + u8 *const cached_accept_state, ReportID *const cached_accept_id, + u8 single, u64a base_offset, const u8 *buf, const u8 *start, + const u8 *end, const u8 **scan_end) { + DEBUG_PRINTF("Starting DFA execution in state %u\n", + *state & SHENG_STATE_MASK); + const u8 *cur_buf = start; + if (DEAD_FUNC(*state)) { + DEBUG_PRINTF("Dead on arrival\n"); + *scan_end = end; + return MO_CONTINUE_MATCHING; + } + DEBUG_PRINTF("Scanning %lli bytes\n", (s64a)(end - start)); + + m128 cur_state = set16x8(*state); + const m128 *masks = s->shuffle_masks; + + while (likely(cur_buf != end)) { + const u8 c = *cur_buf; + const m128 shuffle_mask = masks[c]; + cur_state = pshufb_m128(shuffle_mask, cur_state); + const u8 tmp = movd(cur_state); + + DEBUG_PRINTF("c: %02hhx '%c'\n", c, ourisprint(c) ? 
c : '?'); + DEBUG_PRINTF("s: %u (hi: %u lo: %u)\n", tmp, (tmp & 0xF0) >> 4, + tmp & 0xF); + + if (unlikely(ACCEPT_FUNC(tmp))) { + DEBUG_PRINTF("Accept state %u reached\n", tmp & SHENG_STATE_MASK); + u64a match_offset = base_offset + (cur_buf - buf) + 1; + DEBUG_PRINTF("Match @ %llu\n", match_offset); + if (STOP_AT_MATCH) { + DEBUG_PRINTF("Stopping at match @ %lli\n", + (u64a)(cur_buf - start)); + *state = tmp; + *scan_end = cur_buf; + return MO_MATCHES_PENDING; + } + if (single) { + if (fireSingleReport(cb, ctxt, s->report, match_offset) == + MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } else { + if (fireReports(s, cb, ctxt, tmp, match_offset, + cached_accept_state, cached_accept_id, + 0) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } + } + cur_buf++; + } + *state = movd(cur_state); + *scan_end = cur_buf; + return MO_CONTINUE_MATCHING; +} + +#if defined(HAVE_AVX512VBMI) +static really_inline +char SHENG32_IMPL(u8 *state, NfaCallback cb, void *ctxt, + const struct sheng32 *s, + u8 *const cached_accept_state, + ReportID *const cached_accept_id, + u8 single, u64a base_offset, const u8 *buf, const u8 *start, + const u8 *end, const u8 **scan_end) { + DEBUG_PRINTF("Starting DFA execution in state %u\n", + *state & SHENG32_STATE_MASK); + const u8 *cur_buf = start; + if (DEAD_FUNC32(*state)) { + DEBUG_PRINTF("Dead on arrival\n"); + *scan_end = end; + return MO_CONTINUE_MATCHING; + } + DEBUG_PRINTF("Scanning %lli bytes\n", (s64a)(end - start)); + + m512 cur_state = set64x8(*state); + const m512 *masks = s->succ_masks; + + while (likely(cur_buf != end)) { + const u8 c = *cur_buf; + const m512 succ_mask = masks[c]; + cur_state = vpermb512(cur_state, succ_mask); + const u8 tmp = movd512(cur_state); + + DEBUG_PRINTF("c: %02hhx '%c'\n", c, ourisprint(c) ? 
c : '?'); + DEBUG_PRINTF("s: %u (flag: %u)\n", tmp & SHENG32_STATE_MASK, + tmp & SHENG32_STATE_FLAG_MASK); + + if (unlikely(ACCEPT_FUNC32(tmp))) { + DEBUG_PRINTF("Accept state %u reached\n", tmp & SHENG32_STATE_MASK); + u64a match_offset = base_offset + (cur_buf - buf) + 1; + DEBUG_PRINTF("Match @ %llu\n", match_offset); + if (STOP_AT_MATCH) { + DEBUG_PRINTF("Stopping at match @ %lli\n", + (u64a)(cur_buf - start)); + *state = tmp; + *scan_end = cur_buf; + return MO_MATCHES_PENDING; + } + if (single) { + if (fireSingleReport(cb, ctxt, s->report, match_offset) == + MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } else { + if (fireReports32(s, cb, ctxt, tmp, match_offset, + cached_accept_state, cached_accept_id, + 0) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } + } + cur_buf++; + } + *state = movd512(cur_state); + *scan_end = cur_buf; + return MO_CONTINUE_MATCHING; +} + +static really_inline +char SHENG64_IMPL(u8 *state, NfaCallback cb, void *ctxt, + const struct sheng64 *s, + u8 *const cached_accept_state, + ReportID *const cached_accept_id, + u8 single, u64a base_offset, const u8 *buf, const u8 *start, + const u8 *end, const u8 **scan_end) { + DEBUG_PRINTF("Starting DFA execution in state %u\n", + *state & SHENG64_STATE_MASK); + const u8 *cur_buf = start; + if (DEAD_FUNC64(*state)) { + DEBUG_PRINTF("Dead on arrival\n"); + *scan_end = end; + return MO_CONTINUE_MATCHING; + } + DEBUG_PRINTF("Scanning %lli bytes\n", (s64a)(end - start)); + + m512 cur_state = set64x8(*state); + const m512 *masks = s->succ_masks; + + while (likely(cur_buf != end)) { + const u8 c = *cur_buf; + const m512 succ_mask = masks[c]; + cur_state = vpermb512(cur_state, succ_mask); + const u8 tmp = movd512(cur_state); + + DEBUG_PRINTF("c: %02hhx '%c'\n", c, ourisprint(c) ? c : '?'); + DEBUG_PRINTF("s: %u (flag: %u)\n", tmp & SHENG64_STATE_MASK, + tmp & SHENG64_STATE_FLAG_MASK); + + if (unlikely(ACCEPT_FUNC64(tmp))) { + DEBUG_PRINTF("Accept state %u reached\n", tmp & SHENG64_STATE_MASK); + u64a match_offset = base_offset + (cur_buf - buf) + 1; + DEBUG_PRINTF("Match @ %llu\n", match_offset); + if (STOP_AT_MATCH) { + DEBUG_PRINTF("Stopping at match @ %lli\n", + (u64a)(cur_buf - start)); + *state = tmp; + *scan_end = cur_buf; + return MO_MATCHES_PENDING; + } + if (single) { + if (fireSingleReport(cb, ctxt, s->report, match_offset) == + MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } else { + if (fireReports64(s, cb, ctxt, tmp, match_offset, + cached_accept_state, cached_accept_id, + 0) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } + } + cur_buf++; + } + *state = movd512(cur_state); + *scan_end = cur_buf; + return MO_CONTINUE_MATCHING; +} +#endif diff --git a/regex/nfa/sheng_impl4.h b/regex/nfa/sheng_impl4.h new file mode 100644 index 000000000..440e7396e --- /dev/null +++ b/regex/nfa/sheng_impl4.h @@ -0,0 +1,711 @@ +/* + * Copyright (c) 2016-2020, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * In order to use this macro, the following things need to be defined: + * + * - SHENG_IMPL (name of the Sheng implementation function) + * - INTERESTING_FUNC (name of the function checking for accept, accel or dead + * states) + * - INNER_DEAD_FUNC (name of the inner function checking for dead states) + * - OUTER_DEAD_FUNC (name of the outer function checking for dead states) + * - INNER_ACCEL_FUNC (name of the inner function checking for accel states) + * - OUTER_ACCEL_FUNC (name of the outer function checking for accel states) + * - ACCEPT_FUNC (name of the function checking for accept state) + * - STOP_AT_MATCH (can be 1 or 0, enable or disable stop at match) + */ + +/* unrolled 4-byte-at-a-time version. + * + * we put innerDeadFunc inside interestingFunc() block so that we don't pay for + * dead states checking. however, if interestingFunc is dummy, innerDeadFunc + * gets lost with it, so we need an additional check outside the + * interestingFunc() branch - it's normally dummy so we don't pay for it, but + * when interestingFunc is dummy, outerDeadFunc should be set if we want to + * check for dead states. + * + * also, deadFunc only checks the last known state, but since we can't ever get + * out of the dead state and we don't really care where we died, it's not a + * problem. 
+ */ +static really_inline +char SHENG_IMPL(u8 *state, NfaCallback cb, void *ctxt, const struct sheng *s, + u8 *const cached_accept_state, ReportID *const cached_accept_id, + u8 single, u64a base_offset, const u8 *buf, const u8 *start, + const u8 *end, const u8 **scan_end) { + DEBUG_PRINTF("Starting DFAx4 execution in state %u\n", + *state & SHENG_STATE_MASK); + const u8 *cur_buf = start; + const u8 *min_accel_dist = start; + base_offset++; + DEBUG_PRINTF("Scanning %llu bytes\n", (u64a)(end - start)); + + if (INNER_ACCEL_FUNC(*state) || OUTER_ACCEL_FUNC(*state)) { + DEBUG_PRINTF("Accel state reached @ 0\n"); + const union AccelAux *aaux = get_accel(s, *state & SHENG_STATE_MASK); + const u8 *new_offset = run_accel(aaux, cur_buf, end); + if (new_offset < cur_buf + BAD_ACCEL_DIST) { + min_accel_dist = new_offset + BIG_ACCEL_PENALTY; + } else { + min_accel_dist = new_offset + SMALL_ACCEL_PENALTY; + } + DEBUG_PRINTF("Next accel chance: %llu\n", + (u64a)(min_accel_dist - start)); + DEBUG_PRINTF("Accel scanned %zu bytes\n", new_offset - cur_buf); + cur_buf = new_offset; + DEBUG_PRINTF("New offset: %lli\n", (s64a)(cur_buf - start)); + } + if (INNER_DEAD_FUNC(*state) || OUTER_DEAD_FUNC(*state)) { + DEBUG_PRINTF("Dead on arrival\n"); + *scan_end = end; + return MO_CONTINUE_MATCHING; + } + + m128 cur_state = set16x8(*state); + const m128 *masks = s->shuffle_masks; + + while (likely(end - cur_buf >= 4)) { + const u8 *b1 = cur_buf; + const u8 *b2 = cur_buf + 1; + const u8 *b3 = cur_buf + 2; + const u8 *b4 = cur_buf + 3; + const u8 c1 = *b1; + const u8 c2 = *b2; + const u8 c3 = *b3; + const u8 c4 = *b4; + + const m128 shuffle_mask1 = masks[c1]; + cur_state = pshufb_m128(shuffle_mask1, cur_state); + const u8 a1 = movd(cur_state); + + const m128 shuffle_mask2 = masks[c2]; + cur_state = pshufb_m128(shuffle_mask2, cur_state); + const u8 a2 = movd(cur_state); + + const m128 shuffle_mask3 = masks[c3]; + cur_state = pshufb_m128(shuffle_mask3, cur_state); + const u8 a3 = movd(cur_state); + + const m128 shuffle_mask4 = masks[c4]; + cur_state = pshufb_m128(shuffle_mask4, cur_state); + const u8 a4 = movd(cur_state); + + DEBUG_PRINTF("c: %02hhx '%c'\n", c1, ourisprint(c1) ? c1 : '?'); + DEBUG_PRINTF("s: %u (hi: %u lo: %u)\n", a1, (a1 & 0xF0) >> 4, a1 & 0xF); + + DEBUG_PRINTF("c: %02hhx '%c'\n", c2, ourisprint(c2) ? c2 : '?'); + DEBUG_PRINTF("s: %u (hi: %u lo: %u)\n", a2, (a2 & 0xF0) >> 4, a2 & 0xF); + + DEBUG_PRINTF("c: %02hhx '%c'\n", c3, ourisprint(c3) ? c3 : '?'); + DEBUG_PRINTF("s: %u (hi: %u lo: %u)\n", a3, (a3 & 0xF0) >> 4, a3 & 0xF); + + DEBUG_PRINTF("c: %02hhx '%c'\n", c4, ourisprint(c4) ? 
c4 : '?'); + DEBUG_PRINTF("s: %u (hi: %u lo: %u)\n", a4, (a4 & 0xF0) >> 4, a4 & 0xF); + + if (unlikely(INTERESTING_FUNC(a1, a2, a3, a4))) { + if (ACCEPT_FUNC(a1)) { + u64a match_offset = base_offset + b1 - buf; + DEBUG_PRINTF("Accept state %u reached\n", + a1 & SHENG_STATE_MASK); + DEBUG_PRINTF("Match @ %llu\n", match_offset); + if (STOP_AT_MATCH) { + DEBUG_PRINTF("Stopping at match @ %lli\n", + (s64a)(b1 - start)); + *scan_end = b1; + *state = a1; + return MO_MATCHES_PENDING; + } + if (single) { + if (fireSingleReport(cb, ctxt, s->report, match_offset) == + MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } else { + if (fireReports(s, cb, ctxt, a1, match_offset, + cached_accept_state, cached_accept_id, + 0) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } + } + if (ACCEPT_FUNC(a2)) { + u64a match_offset = base_offset + b2 - buf; + DEBUG_PRINTF("Accept state %u reached\n", + a2 & SHENG_STATE_MASK); + DEBUG_PRINTF("Match @ %llu\n", match_offset); + if (STOP_AT_MATCH) { + DEBUG_PRINTF("Stopping at match @ %lli\n", + (s64a)(b2 - start)); + *scan_end = b2; + *state = a2; + return MO_MATCHES_PENDING; + } + if (single) { + if (fireSingleReport(cb, ctxt, s->report, match_offset) == + MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } else { + if (fireReports(s, cb, ctxt, a2, match_offset, + cached_accept_state, cached_accept_id, + 0) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } + } + if (ACCEPT_FUNC(a3)) { + u64a match_offset = base_offset + b3 - buf; + DEBUG_PRINTF("Accept state %u reached\n", + a3 & SHENG_STATE_MASK); + DEBUG_PRINTF("Match @ %llu\n", match_offset); + if (STOP_AT_MATCH) { + DEBUG_PRINTF("Stopping at match @ %lli\n", + (s64a)(b3 - start)); + *scan_end = b3; + *state = a3; + return MO_MATCHES_PENDING; + } + if (single) { + if (fireSingleReport(cb, ctxt, s->report, match_offset) == + MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } else { + if (fireReports(s, cb, ctxt, a3, match_offset, + cached_accept_state, cached_accept_id, + 0) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } + } + if (ACCEPT_FUNC(a4)) { + u64a match_offset = base_offset + b4 - buf; + DEBUG_PRINTF("Accept state %u reached\n", + a4 & SHENG_STATE_MASK); + DEBUG_PRINTF("Match @ %llu\n", match_offset); + if (STOP_AT_MATCH) { + DEBUG_PRINTF("Stopping at match @ %lli\n", + (s64a)(b4 - start)); + *scan_end = b4; + *state = a4; + return MO_MATCHES_PENDING; + } + if (single) { + if (fireSingleReport(cb, ctxt, s->report, match_offset) == + MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } else { + if (fireReports(s, cb, ctxt, a4, match_offset, + cached_accept_state, cached_accept_id, + 0) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } + } + if (INNER_DEAD_FUNC(a4)) { + DEBUG_PRINTF("Dead state reached @ %lli\n", (s64a)(b4 - buf)); + *scan_end = end; + *state = a4; + return MO_CONTINUE_MATCHING; + } + if (cur_buf > min_accel_dist && INNER_ACCEL_FUNC(a4)) { + DEBUG_PRINTF("Accel state reached @ %lli\n", (s64a)(b4 - buf)); + const union AccelAux *aaux = + get_accel(s, a4 & SHENG_STATE_MASK); + const u8 *new_offset = run_accel(aaux, cur_buf + 4, end); + if (new_offset < cur_buf + 4 + BAD_ACCEL_DIST) { + min_accel_dist = new_offset + BIG_ACCEL_PENALTY; + } else { + min_accel_dist = new_offset + SMALL_ACCEL_PENALTY; + } + DEBUG_PRINTF("Next accel chance: %llu\n", + (u64a)(min_accel_dist - start)); + DEBUG_PRINTF("Accel scanned %llu bytes\n", + (u64a)(new_offset - cur_buf - 4)); + cur_buf = new_offset; + DEBUG_PRINTF("New offset: %llu\n", (u64a)(cur_buf - buf)); + 
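+ /*
+ * Acceleration note: run_accel() has, in effect, just skipped ahead over a
+ * run of bytes handled by the accel scheme for this state. To avoid
+ * thrashing when such a skip turns out to be short (less than
+ * BAD_ACCEL_DIST), the next attempt is deferred by BIG_ACCEL_PENALTY;
+ * longer skips only incur SMALL_ACCEL_PENALTY. Acceleration is re-tried
+ * only once cur_buf passes min_accel_dist.
+ */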
continue; + } + } + if (OUTER_DEAD_FUNC(a4)) { + DEBUG_PRINTF("Dead state reached @ %lli\n", (s64a)(cur_buf - buf)); + *scan_end = end; + *state = a4; + return MO_CONTINUE_MATCHING; + }; + if (cur_buf > min_accel_dist && OUTER_ACCEL_FUNC(a4)) { + DEBUG_PRINTF("Accel state reached @ %lli\n", (s64a)(b4 - buf)); + const union AccelAux *aaux = get_accel(s, a4 & SHENG_STATE_MASK); + const u8 *new_offset = run_accel(aaux, cur_buf + 4, end); + if (new_offset < cur_buf + 4 + BAD_ACCEL_DIST) { + min_accel_dist = new_offset + BIG_ACCEL_PENALTY; + } else { + min_accel_dist = new_offset + SMALL_ACCEL_PENALTY; + } + DEBUG_PRINTF("Next accel chance: %llu\n", + (u64a)(min_accel_dist - start)); + DEBUG_PRINTF("Accel scanned %llu bytes\n", + (u64a)(new_offset - cur_buf - 4)); + cur_buf = new_offset; + DEBUG_PRINTF("New offset: %llu\n", (u64a)(cur_buf - buf)); + continue; + }; + cur_buf += 4; + } + *state = movd(cur_state); + *scan_end = cur_buf; + return MO_CONTINUE_MATCHING; +} + +#if defined(HAVE_AVX512VBMI) +static really_inline +char SHENG32_IMPL(u8 *state, NfaCallback cb, void *ctxt, + const struct sheng32 *s, + u8 *const cached_accept_state, + ReportID *const cached_accept_id, + u8 single, u64a base_offset, const u8 *buf, const u8 *start, + const u8 *end, const u8 **scan_end) { + DEBUG_PRINTF("Starting DFAx4 execution in state %u\n", + *state & SHENG32_STATE_MASK); + const u8 *cur_buf = start; + const u8 *min_accel_dist = start; + base_offset++; + DEBUG_PRINTF("Scanning %llu bytes\n", (u64a)(end - start)); + + if (INNER_ACCEL_FUNC32(*state) || OUTER_ACCEL_FUNC32(*state)) { + DEBUG_PRINTF("Accel state reached @ 0\n"); + const union AccelAux *aaux = + get_accel32(s, *state & SHENG32_STATE_MASK); + const u8 *new_offset = run_accel(aaux, cur_buf, end); + if (new_offset < cur_buf + BAD_ACCEL_DIST) { + min_accel_dist = new_offset + BIG_ACCEL_PENALTY; + } else { + min_accel_dist = new_offset + SMALL_ACCEL_PENALTY; + } + DEBUG_PRINTF("Next accel chance: %llu\n", + (u64a)(min_accel_dist - start)); + DEBUG_PRINTF("Accel scanned %zu bytes\n", new_offset - cur_buf); + cur_buf = new_offset; + DEBUG_PRINTF("New offset: %lli\n", (s64a)(cur_buf - start)); + } + if (INNER_DEAD_FUNC32(*state) || OUTER_DEAD_FUNC32(*state)) { + DEBUG_PRINTF("Dead on arrival\n"); + *scan_end = end; + return MO_CONTINUE_MATCHING; + } + + m512 cur_state = set64x8(*state); + const m512 *masks = s->succ_masks; + + while (likely(end - cur_buf >= 4)) { + const u8 *b1 = cur_buf; + const u8 *b2 = cur_buf + 1; + const u8 *b3 = cur_buf + 2; + const u8 *b4 = cur_buf + 3; + const u8 c1 = *b1; + const u8 c2 = *b2; + const u8 c3 = *b3; + const u8 c4 = *b4; + + const m512 succ_mask1 = masks[c1]; + cur_state = vpermb512(cur_state, succ_mask1); + const u8 a1 = movd512(cur_state); + + const m512 succ_mask2 = masks[c2]; + cur_state = vpermb512(cur_state, succ_mask2); + const u8 a2 = movd512(cur_state); + + const m512 succ_mask3 = masks[c3]; + cur_state = vpermb512(cur_state, succ_mask3); + const u8 a3 = movd512(cur_state); + + const m512 succ_mask4 = masks[c4]; + cur_state = vpermb512(cur_state, succ_mask4); + const u8 a4 = movd512(cur_state); + + DEBUG_PRINTF("c: %02hhx '%c'\n", c1, ourisprint(c1) ? c1 : '?'); + DEBUG_PRINTF("s: %u (flag: %u)\n", a1 & SHENG32_STATE_MASK, + a1 & SHENG32_STATE_FLAG_MASK); + + DEBUG_PRINTF("c: %02hhx '%c'\n", c2, ourisprint(c2) ? c2 : '?'); + DEBUG_PRINTF("s: %u (flag: %u)\n", a2 & SHENG32_STATE_MASK, + a2 & SHENG32_STATE_FLAG_MASK); + + DEBUG_PRINTF("c: %02hhx '%c'\n", c3, ourisprint(c3) ? 
c3 : '?'); + DEBUG_PRINTF("s: %u (flag: %u)\n", a3 & SHENG32_STATE_MASK, + a3 & SHENG32_STATE_FLAG_MASK); + + DEBUG_PRINTF("c: %02hhx '%c'\n", c4, ourisprint(c4) ? c4 : '?'); + DEBUG_PRINTF("s: %u (flag: %u)\n", a4 & SHENG32_STATE_MASK, + a4 & SHENG32_STATE_FLAG_MASK); + + if (unlikely(INTERESTING_FUNC32(a1, a2, a3, a4))) { + if (ACCEPT_FUNC32(a1)) { + u64a match_offset = base_offset + b1 - buf; + DEBUG_PRINTF("Accept state %u reached\n", + a1 & SHENG32_STATE_MASK); + DEBUG_PRINTF("Match @ %llu\n", match_offset); + if (STOP_AT_MATCH) { + DEBUG_PRINTF("Stopping at match @ %lli\n", + (s64a)(b1 - start)); + *scan_end = b1; + *state = a1; + return MO_MATCHES_PENDING; + } + if (single) { + if (fireSingleReport(cb, ctxt, s->report, match_offset) == + MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } else { + if (fireReports32(s, cb, ctxt, a1, match_offset, + cached_accept_state, cached_accept_id, + 0) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } + } + if (ACCEPT_FUNC32(a2)) { + u64a match_offset = base_offset + b2 - buf; + DEBUG_PRINTF("Accept state %u reached\n", + a2 & SHENG32_STATE_MASK); + DEBUG_PRINTF("Match @ %llu\n", match_offset); + if (STOP_AT_MATCH) { + DEBUG_PRINTF("Stopping at match @ %lli\n", + (s64a)(b2 - start)); + *scan_end = b2; + *state = a2; + return MO_MATCHES_PENDING; + } + if (single) { + if (fireSingleReport(cb, ctxt, s->report, match_offset) == + MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } else { + if (fireReports32(s, cb, ctxt, a2, match_offset, + cached_accept_state, cached_accept_id, + 0) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } + } + if (ACCEPT_FUNC32(a3)) { + u64a match_offset = base_offset + b3 - buf; + DEBUG_PRINTF("Accept state %u reached\n", + a3 & SHENG32_STATE_MASK); + DEBUG_PRINTF("Match @ %llu\n", match_offset); + if (STOP_AT_MATCH) { + DEBUG_PRINTF("Stopping at match @ %lli\n", + (s64a)(b3 - start)); + *scan_end = b3; + *state = a3; + return MO_MATCHES_PENDING; + } + if (single) { + if (fireSingleReport(cb, ctxt, s->report, match_offset) == + MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } else { + if (fireReports32(s, cb, ctxt, a3, match_offset, + cached_accept_state, cached_accept_id, + 0) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } + } + if (ACCEPT_FUNC32(a4)) { + u64a match_offset = base_offset + b4 - buf; + DEBUG_PRINTF("Accept state %u reached\n", + a4 & SHENG32_STATE_MASK); + DEBUG_PRINTF("Match @ %llu\n", match_offset); + if (STOP_AT_MATCH) { + DEBUG_PRINTF("Stopping at match @ %lli\n", + (s64a)(b4 - start)); + *scan_end = b4; + *state = a4; + return MO_MATCHES_PENDING; + } + if (single) { + if (fireSingleReport(cb, ctxt, s->report, match_offset) == + MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } else { + if (fireReports32(s, cb, ctxt, a4, match_offset, + cached_accept_state, cached_accept_id, + 0) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } + } + if (INNER_DEAD_FUNC32(a4)) { + DEBUG_PRINTF("Dead state reached @ %lli\n", (s64a)(b4 - buf)); + *scan_end = end; + *state = a4; + return MO_CONTINUE_MATCHING; + } + if (cur_buf > min_accel_dist && INNER_ACCEL_FUNC32(a4)) { + DEBUG_PRINTF("Accel state reached @ %lli\n", (s64a)(b4 - buf)); + const union AccelAux *aaux = + get_accel32(s, a4 & SHENG32_STATE_MASK); + const u8 *new_offset = run_accel(aaux, cur_buf + 4, end); + if (new_offset < cur_buf + 4 + BAD_ACCEL_DIST) { + min_accel_dist = new_offset + BIG_ACCEL_PENALTY; + } else { + min_accel_dist = new_offset + SMALL_ACCEL_PENALTY; + } + DEBUG_PRINTF("Next accel 
chance: %llu\n", + (u64a)(min_accel_dist - start)); + DEBUG_PRINTF("Accel scanned %llu bytes\n", + (u64a)(new_offset - cur_buf - 4)); + cur_buf = new_offset; + DEBUG_PRINTF("New offset: %llu\n", (u64a)(cur_buf - buf)); + continue; + } + } + if (OUTER_DEAD_FUNC32(a4)) { + DEBUG_PRINTF("Dead state reached @ %lli\n", (s64a)(cur_buf - buf)); + *scan_end = end; + *state = a4; + return MO_CONTINUE_MATCHING; + }; + if (cur_buf > min_accel_dist && OUTER_ACCEL_FUNC32(a4)) { + DEBUG_PRINTF("Accel state reached @ %lli\n", (s64a)(b4 - buf)); + const union AccelAux *aaux = + get_accel32(s, a4 & SHENG32_STATE_MASK); + const u8 *new_offset = run_accel(aaux, cur_buf + 4, end); + if (new_offset < cur_buf + 4 + BAD_ACCEL_DIST) { + min_accel_dist = new_offset + BIG_ACCEL_PENALTY; + } else { + min_accel_dist = new_offset + SMALL_ACCEL_PENALTY; + } + DEBUG_PRINTF("Next accel chance: %llu\n", + (u64a)(min_accel_dist - start)); + DEBUG_PRINTF("Accel scanned %llu bytes\n", + (u64a)(new_offset - cur_buf - 4)); + cur_buf = new_offset; + DEBUG_PRINTF("New offset: %llu\n", (u64a)(cur_buf - buf)); + continue; + }; + cur_buf += 4; + } + *state = movd512(cur_state); + *scan_end = cur_buf; + return MO_CONTINUE_MATCHING; +} + +#ifndef NO_SHENG64_IMPL +static really_inline +char SHENG64_IMPL(u8 *state, NfaCallback cb, void *ctxt, + const struct sheng64 *s, + u8 *const cached_accept_state, + ReportID *const cached_accept_id, + u8 single, u64a base_offset, const u8 *buf, const u8 *start, + const u8 *end, const u8 **scan_end) { + DEBUG_PRINTF("Starting DFAx4 execution in state %u\n", + *state & SHENG64_STATE_MASK); + const u8 *cur_buf = start; + base_offset++; + DEBUG_PRINTF("Scanning %llu bytes\n", (u64a)(end - start)); + + if (INNER_DEAD_FUNC64(*state) || OUTER_DEAD_FUNC64(*state)) { + DEBUG_PRINTF("Dead on arrival\n"); + *scan_end = end; + return MO_CONTINUE_MATCHING; + } + + m512 cur_state = set64x8(*state); + const m512 *masks = s->succ_masks; + + while (likely(end - cur_buf >= 4)) { + const u8 *b1 = cur_buf; + const u8 *b2 = cur_buf + 1; + const u8 *b3 = cur_buf + 2; + const u8 *b4 = cur_buf + 3; + const u8 c1 = *b1; + const u8 c2 = *b2; + const u8 c3 = *b3; + const u8 c4 = *b4; + + const m512 succ_mask1 = masks[c1]; + cur_state = vpermb512(cur_state, succ_mask1); + const u8 a1 = movd512(cur_state); + + const m512 succ_mask2 = masks[c2]; + cur_state = vpermb512(cur_state, succ_mask2); + const u8 a2 = movd512(cur_state); + + const m512 succ_mask3 = masks[c3]; + cur_state = vpermb512(cur_state, succ_mask3); + const u8 a3 = movd512(cur_state); + + const m512 succ_mask4 = masks[c4]; + cur_state = vpermb512(cur_state, succ_mask4); + const u8 a4 = movd512(cur_state); + + DEBUG_PRINTF("c: %02hhx '%c'\n", c1, ourisprint(c1) ? c1 : '?'); + DEBUG_PRINTF("s: %u (flag: %u)\n", a1 & SHENG64_STATE_MASK, + a1 & SHENG64_STATE_FLAG_MASK); + + DEBUG_PRINTF("c: %02hhx '%c'\n", c2, ourisprint(c2) ? c2 : '?'); + DEBUG_PRINTF("s: %u (flag: %u)\n", a2 & SHENG64_STATE_MASK, + a2 & SHENG64_STATE_FLAG_MASK); + + DEBUG_PRINTF("c: %02hhx '%c'\n", c3, ourisprint(c3) ? c3 : '?'); + DEBUG_PRINTF("s: %u (flag: %u)\n", a3 & SHENG64_STATE_MASK, + a3 & SHENG64_STATE_FLAG_MASK); + + DEBUG_PRINTF("c: %02hhx '%c'\n", c4, ourisprint(c4) ? 
c4 : '?'); + DEBUG_PRINTF("s: %u (flag: %u)\n", a4 & SHENG64_STATE_MASK, + a4 & SHENG64_STATE_FLAG_MASK); + + if (unlikely(INTERESTING_FUNC64(a1, a2, a3, a4))) { + if (ACCEPT_FUNC64(a1)) { + u64a match_offset = base_offset + b1 - buf; + DEBUG_PRINTF("Accept state %u reached\n", + a1 & SHENG64_STATE_MASK); + DEBUG_PRINTF("Match @ %llu\n", match_offset); + if (STOP_AT_MATCH) { + DEBUG_PRINTF("Stopping at match @ %lli\n", + (s64a)(b1 - start)); + *scan_end = b1; + *state = a1; + return MO_MATCHES_PENDING; + } + if (single) { + if (fireSingleReport(cb, ctxt, s->report, match_offset) == + MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } else { + if (fireReports64(s, cb, ctxt, a1, match_offset, + cached_accept_state, cached_accept_id, + 0) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } + } + if (ACCEPT_FUNC64(a2)) { + u64a match_offset = base_offset + b2 - buf; + DEBUG_PRINTF("Accept state %u reached\n", + a2 & SHENG64_STATE_MASK); + DEBUG_PRINTF("Match @ %llu\n", match_offset); + if (STOP_AT_MATCH) { + DEBUG_PRINTF("Stopping at match @ %lli\n", + (s64a)(b2 - start)); + *scan_end = b2; + *state = a2; + return MO_MATCHES_PENDING; + } + if (single) { + if (fireSingleReport(cb, ctxt, s->report, match_offset) == + MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } else { + if (fireReports64(s, cb, ctxt, a2, match_offset, + cached_accept_state, cached_accept_id, + 0) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } + } + if (ACCEPT_FUNC64(a3)) { + u64a match_offset = base_offset + b3 - buf; + DEBUG_PRINTF("Accept state %u reached\n", + a3 & SHENG64_STATE_MASK); + DEBUG_PRINTF("Match @ %llu\n", match_offset); + if (STOP_AT_MATCH) { + DEBUG_PRINTF("Stopping at match @ %lli\n", + (s64a)(b3 - start)); + *scan_end = b3; + *state = a3; + return MO_MATCHES_PENDING; + } + if (single) { + if (fireSingleReport(cb, ctxt, s->report, match_offset) == + MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } else { + if (fireReports64(s, cb, ctxt, a3, match_offset, + cached_accept_state, cached_accept_id, + 0) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } + } + if (ACCEPT_FUNC64(a4)) { + u64a match_offset = base_offset + b4 - buf; + DEBUG_PRINTF("Accept state %u reached\n", + a4 & SHENG64_STATE_MASK); + DEBUG_PRINTF("Match @ %llu\n", match_offset); + if (STOP_AT_MATCH) { + DEBUG_PRINTF("Stopping at match @ %lli\n", + (s64a)(b4 - start)); + *scan_end = b4; + *state = a4; + return MO_MATCHES_PENDING; + } + if (single) { + if (fireSingleReport(cb, ctxt, s->report, match_offset) == + MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } else { + if (fireReports64(s, cb, ctxt, a4, match_offset, + cached_accept_state, cached_accept_id, + 0) == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } + } + } + if (INNER_DEAD_FUNC64(a4)) { + DEBUG_PRINTF("Dead state reached @ %lli\n", (s64a)(b4 - buf)); + *scan_end = end; + *state = a4; + return MO_CONTINUE_MATCHING; + } + } + if (OUTER_DEAD_FUNC64(a4)) { + DEBUG_PRINTF("Dead state reached @ %lli\n", (s64a)(cur_buf - buf)); + *scan_end = end; + *state = a4; + return MO_CONTINUE_MATCHING; + } + cur_buf += 4; + } + *state = movd512(cur_state); + *scan_end = cur_buf; + return MO_CONTINUE_MATCHING; +} +#endif // !NO_SHENG64_IMPL +#endif diff --git a/regex/nfa/sheng_internal.h b/regex/nfa/sheng_internal.h new file mode 100644 index 000000000..98536886c --- /dev/null +++ b/regex/nfa/sheng_internal.h @@ -0,0 +1,107 @@ +/* + * Copyright (c) 2016-2020, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + 
* modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef SHENG_INTERNAL_H_ +#define SHENG_INTERNAL_H_ + +#include "ue2common.h" +#include "util/simd_types.h" + +#define SHENG_STATE_ACCEPT 0x10 +#define SHENG_STATE_DEAD 0x20 +#define SHENG_STATE_ACCEL 0x40 +#define SHENG_STATE_MASK 0xF +#define SHENG_STATE_FLAG_MASK 0x70 + +#define SHENG32_STATE_ACCEPT 0x20 +#define SHENG32_STATE_DEAD 0x40 +#define SHENG32_STATE_ACCEL 0x80 +#define SHENG32_STATE_MASK 0x1F +#define SHENG32_STATE_FLAG_MASK 0xE0 + +#define SHENG64_STATE_ACCEPT 0x40 +#define SHENG64_STATE_DEAD 0x80 +#define SHENG64_STATE_MASK 0x3F +#define SHENG64_STATE_FLAG_MASK 0xC0 + +#define SHENG_FLAG_SINGLE_REPORT 0x1 +#define SHENG_FLAG_CAN_DIE 0x2 +#define SHENG_FLAG_HAS_ACCEL 0x4 + +struct report_list { + u32 count; + ReportID report[]; +}; + +struct sstate_aux { + u32 accept; + u32 accept_eod; + u32 accel; + u32 top; +}; + +struct sheng { + m128 shuffle_masks[256]; + u32 length; + u32 aux_offset; + u32 report_offset; + u32 accel_offset; + u8 n_states; + u8 anchored; + u8 floating; + u8 flags; + ReportID report; +}; + +struct sheng32 { + m512 succ_masks[256]; + u32 length; + u32 aux_offset; + u32 report_offset; + u32 accel_offset; + u8 n_states; + u8 anchored; + u8 floating; + u8 flags; + ReportID report; +}; + +struct sheng64 { + m512 succ_masks[256]; + u32 length; + u32 aux_offset; + u32 report_offset; + u32 accel_offset; + u8 n_states; + u8 anchored; + u8 floating; + u8 flags; + ReportID report; +}; + +#endif /* SHENG_INTERNAL_H_ */ diff --git a/regex/nfa/shufti.c b/regex/nfa/shufti.c new file mode 100644 index 000000000..09ffc0cf9 --- /dev/null +++ b/regex/nfa/shufti.c @@ -0,0 +1,1097 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Shufti: character class acceleration. + * + * Utilises the SSSE3 pshufb shuffle instruction + */ + +#include "shufti.h" +#include "ue2common.h" +#include "util/arch.h" +#include "util/bitutils.h" +#include "util/simd_utils.h" +#include "util/unaligned.h" + +#ifdef DEBUG +#include <ctype.h> + +#define DUMP_MSK(_t) \ +static UNUSED \ +void dumpMsk##_t(m##_t msk) { \ + u8 * mskAsU8 = (u8 *)&msk; \ + for (unsigned i = 0; i < sizeof(msk); i++) { \ + u8 c = mskAsU8[i]; \ + for (int j = 0; j < 8; j++) { \ + if ((c >> (7-j)) & 0x1) \ + printf("1"); \ + else \ + printf("0"); \ + } \ + printf(" "); \ + } \ +} \ +static UNUSED \ +void dumpMsk##_t##AsChars(m##_t msk) { \ + u8 * mskAsU8 = (u8 *)&msk; \ + for (unsigned i = 0; i < sizeof(msk); i++) { \ + u8 c = mskAsU8[i]; \ + if (isprint(c)) \ + printf("%c",c); \ + else \ + printf("."); \ + } \ +} + +#endif + +/** \brief Naive byte-by-byte implementation. */ +static really_inline +const u8 *shuftiFwdSlow(const u8 *lo, const u8 *hi, const u8 *buf, + const u8 *buf_end) { + assert(buf < buf_end); + + for (; buf < buf_end; ++buf) { + u8 c = *buf; + if (lo[c & 0xf] & hi[c >> 4]) { + break; + } + } + return buf; +} + +/** \brief Naive byte-by-byte implementation.
*/ +static really_inline +const u8 *shuftiRevSlow(const u8 *lo, const u8 *hi, const u8 *buf, + const u8 *buf_end) { + assert(buf < buf_end); + + for (buf_end--; buf_end >= buf; buf_end--) { + u8 c = *buf_end; + if (lo[c & 0xf] & hi[c >> 4]) { + break; + } + } + return buf_end; +} + +#if !defined(HAVE_AVX2) +/* Normal SSSE3 shufti */ + +#ifdef DEBUG +DUMP_MSK(128) +#endif + +#define GET_LO_4(chars) and128(chars, low4bits) +#define GET_HI_4(chars) rshift64_m128(andnot128(low4bits, chars), 4) + +static really_inline +u32 block(m128 mask_lo, m128 mask_hi, m128 chars, const m128 low4bits, + const m128 compare) { + m128 c_lo = pshufb_m128(mask_lo, GET_LO_4(chars)); + m128 c_hi = pshufb_m128(mask_hi, GET_HI_4(chars)); + m128 t = and128(c_lo, c_hi); + +#ifdef DEBUG + DEBUG_PRINTF(" chars: "); dumpMsk128AsChars(chars); printf("\n"); + DEBUG_PRINTF(" char: "); dumpMsk128(chars); printf("\n"); + DEBUG_PRINTF(" c_lo: "); dumpMsk128(c_lo); printf("\n"); + DEBUG_PRINTF(" c_hi: "); dumpMsk128(c_hi); printf("\n"); + DEBUG_PRINTF(" t: "); dumpMsk128(t); printf("\n"); +#endif + return movemask128(eq128(t, compare)); +} + +static really_inline +const u8 *firstMatch(const u8 *buf, u32 z) { + if (unlikely(z != 0xffff)) { + u32 pos = ctz32(~z & 0xffff); + assert(pos < 16); + return buf + pos; + } else { + return NULL; // no match + } +} + +static really_inline +const u8 *fwdBlock(m128 mask_lo, m128 mask_hi, m128 chars, const u8 *buf, + const m128 low4bits, const m128 zeroes) { + u32 z = block(mask_lo, mask_hi, chars, low4bits, zeroes); + + return firstMatch(buf, z); +} + +const u8 *shuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, + const u8 *buf_end) { + assert(buf && buf_end); + assert(buf < buf_end); + + // Slow path for small cases. + if (buf_end - buf < 16) { + return shuftiFwdSlow((const u8 *)&mask_lo, (const u8 *)&mask_hi, + buf, buf_end); + } + + const m128 zeroes = zeroes128(); + const m128 low4bits = _mm_set1_epi8(0xf); + const u8 *rv; + + size_t min = (size_t)buf % 16; + assert(buf_end - buf >= 16); + + // Preconditioning: most of the time our buffer won't be aligned. + m128 chars = loadu128(buf); + rv = fwdBlock(mask_lo, mask_hi, chars, buf, low4bits, zeroes); + if (rv) { + return rv; + } + buf += (16 - min); + + // Unrolling was here, but it wasn't doing anything but taking up space. + // Reroll FTW. + + const u8 *last_block = buf_end - 16; + while (buf < last_block) { + m128 lchars = load128(buf); + rv = fwdBlock(mask_lo, mask_hi, lchars, buf, low4bits, zeroes); + if (rv) { + return rv; + } + buf += 16; + } + + // Use an unaligned load to mop up the last 16 bytes and get an accurate + // picture to buf_end. 
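+ /*
+ * The final unaligned load may overlap bytes already covered by the
+ * aligned loop. Re-scanning the overlap is harmless: those bytes did not
+ * match (otherwise we would have returned already), and firstMatch()
+ * reports the earliest matching position in the block it is given.
+ */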
+ assert(buf <= buf_end && buf >= buf_end - 16); + chars = loadu128(buf_end - 16); + rv = fwdBlock(mask_lo, mask_hi, chars, buf_end - 16, low4bits, zeroes); + if (rv) { + return rv; + } + + return buf_end; +} + +static really_inline +const u8 *lastMatch(const u8 *buf, m128 t, m128 compare) { +#ifdef DEBUG + DEBUG_PRINTF("confirming match in:"); dumpMsk128(t); printf("\n"); +#endif + + u32 z = movemask128(eq128(t, compare)); + if (unlikely(z != 0xffff)) { + u32 pos = clz32(~z & 0xffff); + DEBUG_PRINTF("buf=%p, pos=%u\n", buf, pos); + assert(pos >= 16 && pos < 32); + return buf + (31 - pos); + } else { + return NULL; // no match + } +} + + +static really_inline +const u8 *revBlock(m128 mask_lo, m128 mask_hi, m128 chars, const u8 *buf, + const m128 low4bits, const m128 zeroes) { + m128 c_lo = pshufb_m128(mask_lo, GET_LO_4(chars)); + m128 c_hi = pshufb_m128(mask_hi, GET_HI_4(chars)); + m128 t = and128(c_lo, c_hi); + +#ifdef DEBUG + DEBUG_PRINTF(" chars: "); dumpMsk128AsChars(chars); printf("\n"); + DEBUG_PRINTF(" char: "); dumpMsk128(chars); printf("\n"); + DEBUG_PRINTF(" c_lo: "); dumpMsk128(c_lo); printf("\n"); + DEBUG_PRINTF(" c_hi: "); dumpMsk128(c_hi); printf("\n"); + DEBUG_PRINTF(" t: "); dumpMsk128(t); printf("\n"); +#endif + + return lastMatch(buf, t, zeroes); +} + +const u8 *rshuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, + const u8 *buf_end) { + assert(buf && buf_end); + assert(buf < buf_end); + + // Slow path for small cases. + if (buf_end - buf < 16) { + return shuftiRevSlow((const u8 *)&mask_lo, (const u8 *)&mask_hi, + buf, buf_end); + } + + const m128 zeroes = zeroes128(); + const m128 low4bits = _mm_set1_epi8(0xf); + const u8 *rv; + + assert(buf_end - buf >= 16); + + // Preconditioning: most of the time our buffer won't be aligned. + m128 chars = loadu128(buf_end - 16); + rv = revBlock(mask_lo, mask_hi, chars, buf_end - 16, low4bits, zeroes); + if (rv) { + return rv; + } + buf_end = (const u8 *)((size_t)buf_end & ~((size_t)0xf)); + + // Unrolling was here, but it wasn't doing anything but taking up space. + // Reroll FTW. + + const u8 *last_block = buf + 16; + while (buf_end > last_block) { + buf_end -= 16; + m128 lchars = load128(buf_end); + rv = revBlock(mask_lo, mask_hi, lchars, buf_end, low4bits, zeroes); + if (rv) { + return rv; + } + } + + // Use an unaligned load to mop up the last 16 bytes and get an accurate + // picture to buf. 
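+ /*
+ * Reverse search: lastMatch() takes clz of the inverted movemask, i.e. it
+ * reports the highest (rightmost) matching byte in the block. If the whole
+ * range is match-free, rshuftiExec() falls through and returns buf - 1,
+ * one byte before the searched range, as the no-match result.
+ */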
+ chars = loadu128(buf); + rv = revBlock(mask_lo, mask_hi, chars, buf, low4bits, zeroes); + if (rv) { + return rv; + } + + return buf - 1; +} + +static really_inline +const u8 *fwdBlock2(m128 mask1_lo, m128 mask1_hi, m128 mask2_lo, m128 mask2_hi, + m128 chars, const u8 *buf, const m128 low4bits, + const m128 ones) { + m128 chars_lo = GET_LO_4(chars); + m128 chars_hi = GET_HI_4(chars); + m128 c_lo = pshufb_m128(mask1_lo, chars_lo); + m128 c_hi = pshufb_m128(mask1_hi, chars_hi); + m128 t = or128(c_lo, c_hi); + +#ifdef DEBUG + DEBUG_PRINTF(" chars: "); dumpMsk128AsChars(chars); printf("\n"); + DEBUG_PRINTF(" char: "); dumpMsk128(chars); printf("\n"); + DEBUG_PRINTF(" c_lo: "); dumpMsk128(c_lo); printf("\n"); + DEBUG_PRINTF(" c_hi: "); dumpMsk128(c_hi); printf("\n"); + DEBUG_PRINTF(" t: "); dumpMsk128(t); printf("\n"); +#endif + + m128 c2_lo = pshufb_m128(mask2_lo, chars_lo); + m128 c2_hi = pshufb_m128(mask2_hi, chars_hi); + m128 t2 = or128(t, rshiftbyte_m128(or128(c2_lo, c2_hi), 1)); + +#ifdef DEBUG + DEBUG_PRINTF(" c2_lo: "); dumpMsk128(c2_lo); printf("\n"); + DEBUG_PRINTF(" c2_hi: "); dumpMsk128(c2_hi); printf("\n"); + DEBUG_PRINTF(" t2: "); dumpMsk128(t2); printf("\n"); +#endif + + u32 z = movemask128(eq128(t2, ones)); + DEBUG_PRINTF(" z: 0x%08x\n", z); + return firstMatch(buf, z); +} + +const u8 *shuftiDoubleExec(m128 mask1_lo, m128 mask1_hi, + m128 mask2_lo, m128 mask2_hi, + const u8 *buf, const u8 *buf_end) { + const m128 ones = ones128(); + const m128 low4bits = _mm_set1_epi8(0xf); + const u8 *rv; + + size_t min = (size_t)buf % 16; + + // Preconditioning: most of the time our buffer won't be aligned. + m128 chars = loadu128(buf); + rv = fwdBlock2(mask1_lo, mask1_hi, mask2_lo, mask2_hi, + chars, buf, low4bits, ones); + if (rv) { + return rv; + } + buf += (16 - min); + + // Unrolling was here, but it wasn't doing anything but taking up space. + // Reroll FTW. + + const u8 *last_block = buf_end - 16; + while (buf < last_block) { + m128 lchars = load128(buf); + rv = fwdBlock2(mask1_lo, mask1_hi, mask2_lo, mask2_hi, + lchars, buf, low4bits, ones); + if (rv) { + return rv; + } + buf += 16; + } + + // Use an unaligned load to mop up the last 16 bytes and get an accurate + // picture to buf_end. 
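+ /*
+ * Double shufti: fwdBlock2() performs two shuffle lookups per block and
+ * shifts the second result down one byte lane before OR-ing it with the
+ * first, so lane i combines the verdict for byte i (first mask pair) with
+ * the verdict for byte i+1 (second mask pair). A candidate match is
+ * reported at the first lane whose combined value is not all-ones.
+ */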
+ chars = loadu128(buf_end - 16); + rv = fwdBlock2(mask1_lo, mask1_hi, mask2_lo, mask2_hi, + chars, buf_end - 16, low4bits, ones); + if (rv) { + return rv; + } + + return buf_end; +} + +#elif !defined(HAVE_AVX512) +// AVX2 - 256 wide shuftis + +#ifdef DEBUG +DUMP_MSK(256) +#endif + +#define GET_LO_4(chars) and256(chars, low4bits) +#define GET_HI_4(chars) rshift64_m256(andnot256(low4bits, chars), 4) + +static really_inline +u32 block(m256 mask_lo, m256 mask_hi, m256 chars, const m256 low4bits, + const m256 compare) { + m256 c_lo = pshufb_m256(mask_lo, GET_LO_4(chars)); + m256 c_hi = pshufb_m256(mask_hi, GET_HI_4(chars)); + m256 t = and256(c_lo, c_hi); + +#ifdef DEBUG + DEBUG_PRINTF(" chars: "); dumpMsk256AsChars(chars); printf("\n"); + DEBUG_PRINTF(" char: "); dumpMsk256(chars); printf("\n"); + DEBUG_PRINTF(" c_lo: "); dumpMsk256(c_lo); printf("\n"); + DEBUG_PRINTF(" c_hi: "); dumpMsk256(c_hi); printf("\n"); + DEBUG_PRINTF(" t: "); dumpMsk256(t); printf("\n"); +#endif + + return movemask256(eq256(t, compare)); +} + +static really_inline +const u8 *firstMatch(const u8 *buf, u32 z) { + DEBUG_PRINTF("z 0x%08x\n", z); + if (unlikely(z != 0xffffffff)) { + u32 pos = ctz32(~z); + assert(pos < 32); + DEBUG_PRINTF("match @ pos %u\n", pos); + return buf + pos; + } else { + return NULL; // no match + } +} + +static really_inline +const u8 *fwdBlockShort(m256 mask, m128 chars, const u8 *buf, + const m256 low4bits) { + // do the hi and lo shuffles in the one avx register + m256 c = combine2x128(rshift64_m128(chars, 4), chars); + c = and256(c, low4bits); + m256 c_shuf = pshufb_m256(mask, c); + m128 t = and128(movdq_hi(c_shuf), cast256to128(c_shuf)); + // the upper 32-bits can't match + u32 z = 0xffff0000U | movemask128(eq128(t, zeroes128())); + + return firstMatch(buf, z); +} + +static really_inline +const u8 *shuftiFwdShort(m128 mask_lo, m128 mask_hi, const u8 *buf, + const u8 *buf_end, const m256 low4bits) { + // run shufti over two overlapping 16-byte unaligned reads + const m256 mask = combine2x128(mask_hi, mask_lo); + m128 chars = loadu128(buf); + const u8 *rv = fwdBlockShort(mask, chars, buf, low4bits); + if (rv) { + return rv; + } + + chars = loadu128(buf_end - 16); + rv = fwdBlockShort(mask, chars, buf_end - 16, low4bits); + if (rv) { + return rv; + } + return buf_end; +} + +static really_inline +const u8 *fwdBlock(m256 mask_lo, m256 mask_hi, m256 chars, const u8 *buf, + const m256 low4bits, const m256 zeroes) { + u32 z = block(mask_lo, mask_hi, chars, low4bits, zeroes); + + return firstMatch(buf, z); +} + +/* takes 128 bit masks, but operates on 256 bits of data */ +const u8 *shuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, + const u8 *buf_end) { + assert(buf && buf_end); + assert(buf < buf_end); + DEBUG_PRINTF("shufti %p len %zu\n", buf, buf_end - buf); + + // Slow path for small cases. + if (buf_end - buf < 16) { + return shuftiFwdSlow((const u8 *)&mask_lo, (const u8 *)&mask_hi, + buf, buf_end); + } + + const m256 low4bits = set32x8(0xf); + + if (buf_end - buf <= 32) { + return shuftiFwdShort(mask_lo, mask_hi, buf, buf_end, low4bits); + } + + const m256 zeroes = zeroes256(); + const m256 wide_mask_lo = set2x128(mask_lo); + const m256 wide_mask_hi = set2x128(mask_hi); + const u8 *rv; + + size_t min = (size_t)buf % 32; + assert(buf_end - buf >= 32); + + // Preconditioning: most of the time our buffer won't be aligned. 
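+ /*
+ * The unaligned head load covers buf..buf+31; buf is then advanced to an
+ * aligned position (buf += 32 - min), so a few bytes may be scanned twice
+ * by the first aligned iteration. As with the tail overlap, this cannot
+ * change the first-match result.
+ */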
+ m256 chars = loadu256(buf); + rv = fwdBlock(wide_mask_lo, wide_mask_hi, chars, buf, low4bits, zeroes); + if (rv) { + return rv; + } + buf += (32 - min); + + // Unrolling was here, but it wasn't doing anything but taking up space. + // Reroll FTW. + + const u8 *last_block = buf_end - 32; + while (buf < last_block) { + m256 lchars = load256(buf); + rv = fwdBlock(wide_mask_lo, wide_mask_hi, lchars, buf, low4bits, zeroes); + if (rv) { + return rv; + } + buf += 32; + } + + // Use an unaligned load to mop up the last 32 bytes and get an accurate + // picture to buf_end. + assert(buf <= buf_end && buf >= buf_end - 32); + chars = loadu256(buf_end - 32); + rv = fwdBlock(wide_mask_lo, wide_mask_hi, chars, buf_end - 32, low4bits, zeroes); + if (rv) { + return rv; + } + + return buf_end; +} + +static really_inline +const u8 *lastMatch(const u8 *buf, u32 z) { + if (unlikely(z != 0xffffffff)) { + u32 pos = clz32(~z); + DEBUG_PRINTF("buf=%p, pos=%u\n", buf, pos); + return buf + (31 - pos); + } else { + return NULL; // no match + } +} + +static really_inline +const u8 *revBlock(m256 mask_lo, m256 mask_hi, m256 chars, const u8 *buf, + const m256 low4bits, const m256 zeroes) { + m256 c_lo = pshufb_m256(mask_lo, GET_LO_4(chars)); + m256 c_hi = pshufb_m256(mask_hi, GET_HI_4(chars)); + m256 t = and256(c_lo, c_hi); + +#ifdef DEBUG + DEBUG_PRINTF(" chars: "); dumpMsk256AsChars(chars); printf("\n"); + DEBUG_PRINTF(" char: "); dumpMsk256(chars); printf("\n"); + DEBUG_PRINTF(" c_lo: "); dumpMsk256(c_lo); printf("\n"); + DEBUG_PRINTF(" c_hi: "); dumpMsk256(c_hi); printf("\n"); + DEBUG_PRINTF(" t: "); dumpMsk256(t); printf("\n"); +#endif + + u32 z = movemask256(eq256(t, zeroes)); + return lastMatch(buf, z); +} + +static really_inline +const u8 *revBlockShort(m256 mask, m128 chars, const u8 *buf, + const m256 low4bits) { + // do the hi and lo shuffles in the one avx register + m256 c = combine2x128(rshift64_m128(chars, 4), chars); + c = and256(c, low4bits); + m256 c_shuf = pshufb_m256(mask, c); + m128 t = and128(movdq_hi(c_shuf), cast256to128(c_shuf)); + // the upper 32-bits can't match + u32 z = 0xffff0000U | movemask128(eq128(t, zeroes128())); + + return lastMatch(buf, z); +} + +static really_inline +const u8 *shuftiRevShort(m128 mask_lo, m128 mask_hi, const u8 *buf, + const u8 *buf_end, const m256 low4bits) { + // run shufti over two overlapping 16-byte unaligned reads + const m256 mask = combine2x128(mask_hi, mask_lo); + + m128 chars = loadu128(buf_end - 16); + const u8 *rv = revBlockShort(mask, chars, buf_end - 16, low4bits); + if (rv) { + return rv; + } + + chars = loadu128(buf); + rv = revBlockShort(mask, chars, buf, low4bits); + if (rv) { + return rv; + } + return buf - 1; +} + + +/* takes 128 bit masks, but operates on 256 bits of data */ +const u8 *rshuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, + const u8 *buf_end) { + assert(buf && buf_end); + assert(buf < buf_end); + + // Slow path for small cases. + if (buf_end - buf < 16) { + return shuftiRevSlow((const u8 *)&mask_lo, (const u8 *)&mask_hi, + buf, buf_end); + } + + const m256 low4bits = set32x8(0xf); + + if (buf_end - buf <= 32) { + return shuftiRevShort(mask_lo, mask_hi, buf, buf_end, low4bits); + } + + const m256 zeroes = zeroes256(); + const m256 wide_mask_lo = set2x128(mask_lo); + const m256 wide_mask_hi = set2x128(mask_hi); + const u8 *rv; + + assert(buf_end - buf >= 32); + + // Preconditioning: most of the time our buffer won't be aligned. 
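+ /*
+ * Mirror image of the forward case: scan the unaligned tail ending at
+ * buf_end first, then round buf_end down to a 32-byte boundary and walk
+ * aligned blocks backwards. Any overlap is already known to be match-free,
+ * so re-scanning it cannot affect the rightmost-match result.
+ */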
+ m256 chars = loadu256(buf_end - 32); + rv = revBlock(wide_mask_lo, wide_mask_hi, chars, buf_end - 32, low4bits, zeroes); + if (rv) { + return rv; + } + buf_end = (const u8 *)((size_t)buf_end & ~((size_t)0x1f)); + + // Unrolling was here, but it wasn't doing anything but taking up space. + // Reroll FTW. + const u8 *last_block = buf + 32; + while (buf_end > last_block) { + buf_end -= 32; + m256 lchars = load256(buf_end); + rv = revBlock(wide_mask_lo, wide_mask_hi, lchars, buf_end, low4bits, zeroes); + if (rv) { + return rv; + } + } + + // Use an unaligned load to mop up the last 32 bytes and get an accurate + // picture to buf. + chars = loadu256(buf); + rv = revBlock(wide_mask_lo, wide_mask_hi, chars, buf, low4bits, zeroes); + if (rv) { + return rv; + } + + return buf - 1; +} + +static really_inline +const u8 *fwdBlock2(m256 mask1_lo, m256 mask1_hi, m256 mask2_lo, m256 mask2_hi, + m256 chars, const u8 *buf, const m256 low4bits, + const m256 ones) { + DEBUG_PRINTF("buf %p\n", buf); + m256 chars_lo = GET_LO_4(chars); + m256 chars_hi = GET_HI_4(chars); + m256 c_lo = pshufb_m256(mask1_lo, chars_lo); + m256 c_hi = pshufb_m256(mask1_hi, chars_hi); + m256 t = or256(c_lo, c_hi); + +#ifdef DEBUG + DEBUG_PRINTF(" chars: "); dumpMsk256AsChars(chars); printf("\n"); + DEBUG_PRINTF(" char: "); dumpMsk256(chars); printf("\n"); + DEBUG_PRINTF(" c_lo: "); dumpMsk256(c_lo); printf("\n"); + DEBUG_PRINTF(" c_hi: "); dumpMsk256(c_hi); printf("\n"); + DEBUG_PRINTF(" t: "); dumpMsk256(t); printf("\n"); +#endif + + m256 c2_lo = pshufb_m256(mask2_lo, chars_lo); + m256 c2_hi = pshufb_m256(mask2_hi, chars_hi); + m256 t2 = or256(t, rshift128_m256(or256(c2_lo, c2_hi), 1)); + +#ifdef DEBUG + DEBUG_PRINTF(" c2_lo: "); dumpMsk256(c2_lo); printf("\n"); + DEBUG_PRINTF(" c2_hi: "); dumpMsk256(c2_hi); printf("\n"); + DEBUG_PRINTF(" t2: "); dumpMsk256(t2); printf("\n"); +#endif + u32 z = movemask256(eq256(t2, ones)); + + return firstMatch(buf, z); +} + +static really_inline +const u8 *fwdBlockShort2(m256 mask1, m256 mask2, m128 chars, const u8 *buf, + const m256 low4bits) { + // do the hi and lo shuffles in the one avx register + m256 c = combine2x128(rshift64_m128(chars, 4), chars); + c = and256(c, low4bits); + m256 c_shuf1 = pshufb_m256(mask1, c); + m256 c_shuf2 = rshift128_m256(pshufb_m256(mask2, c), 1); + m256 t0 = or256(c_shuf1, c_shuf2); + m128 t = or128(movdq_hi(t0), cast256to128(t0)); + // the upper 32-bits can't match + u32 z = 0xffff0000U | movemask128(eq128(t, ones128())); + + return firstMatch(buf, z); +} + +static really_inline +const u8 *shuftiDoubleShort(m128 mask1_lo, m128 mask1_hi, m128 mask2_lo, + m128 mask2_hi, const u8 *buf, const u8 *buf_end) { + DEBUG_PRINTF("buf %p len %zu\n", buf, buf_end - buf); + const m256 low4bits = set32x8(0xf); + // run shufti over two overlapping 16-byte unaligned reads + const m256 mask1 = combine2x128(mask1_hi, mask1_lo); + const m256 mask2 = combine2x128(mask2_hi, mask2_lo); + m128 chars = loadu128(buf); + const u8 *rv = fwdBlockShort2(mask1, mask2, chars, buf, low4bits); + if (rv) { + return rv; + } + + chars = loadu128(buf_end - 16); + rv = fwdBlockShort2(mask1, mask2, chars, buf_end - 16, low4bits); + if (rv) { + return rv; + } + return buf_end; +} + +/* takes 128 bit masks, but operates on 256 bits of data */ +const u8 *shuftiDoubleExec(m128 mask1_lo, m128 mask1_hi, + m128 mask2_lo, m128 mask2_hi, + const u8 *buf, const u8 *buf_end) { + /* we should always have at least 16 bytes */ + assert(buf_end - buf >= 16); + DEBUG_PRINTF("buf %p len %zu\n", buf, buf_end - buf); + 
+ if (buf_end - buf < 32) { + return shuftiDoubleShort(mask1_lo, mask1_hi, mask2_lo, mask2_hi, buf, + buf_end); + } + + const m256 ones = ones256(); + const m256 low4bits = set32x8(0xf); + const m256 wide_mask1_lo = set2x128(mask1_lo); + const m256 wide_mask1_hi = set2x128(mask1_hi); + const m256 wide_mask2_lo = set2x128(mask2_lo); + const m256 wide_mask2_hi = set2x128(mask2_hi); + const u8 *rv; + + size_t min = (size_t)buf % 32; + + // Preconditioning: most of the time our buffer won't be aligned. + m256 chars = loadu256(buf); + rv = fwdBlock2(wide_mask1_lo, wide_mask1_hi, wide_mask2_lo, wide_mask2_hi, + chars, buf, low4bits, ones); + if (rv) { + return rv; + } + buf += (32 - min); + + // Unrolling was here, but it wasn't doing anything but taking up space. + // Reroll FTW. + const u8 *last_block = buf_end - 32; + while (buf < last_block) { + m256 lchars = load256(buf); + rv = fwdBlock2(wide_mask1_lo, wide_mask1_hi, wide_mask2_lo, wide_mask2_hi, + lchars, buf, low4bits, ones); + if (rv) { + return rv; + } + buf += 32; + } + + // Use an unaligned load to mop up the last 32 bytes and get an accurate + // picture to buf_end. + chars = loadu256(buf_end - 32); + rv = fwdBlock2(wide_mask1_lo, wide_mask1_hi, wide_mask2_lo, wide_mask2_hi, + chars, buf_end - 32, low4bits, ones); + if (rv) { + return rv; + } + + return buf_end; +} + +#else // defined(HAVE_AVX512) + +#ifdef DEBUG +DUMP_MSK(512) +#endif + +static really_inline +u64a block(m512 mask_lo, m512 mask_hi, m512 chars, const m512 low4bits, + const m512 compare) { + m512 c_lo = pshufb_m512(mask_lo, and512(chars, low4bits)); + m512 c_hi = pshufb_m512(mask_hi, + rshift64_m512(andnot512(low4bits, chars), 4)); + m512 t = and512(c_lo, c_hi); + +#ifdef DEBUG + DEBUG_PRINTF(" chars: "); dumpMsk512AsChars(chars); printf("\n"); + DEBUG_PRINTF(" char: "); dumpMsk512(chars); printf("\n"); + DEBUG_PRINTF(" c_lo: "); dumpMsk512(c_lo); printf("\n"); + DEBUG_PRINTF(" c_hi: "); dumpMsk512(c_hi); printf("\n"); + DEBUG_PRINTF(" t: "); dumpMsk512(t); printf("\n"); +#endif + + return eq512mask(t, compare); +} +static really_inline +const u8 *firstMatch64(const u8 *buf, u64a z) { + DEBUG_PRINTF("z 0x%016llx\n", z); + if (unlikely(z != ~0ULL)) { + u32 pos = ctz64(~z); + DEBUG_PRINTF("match @ pos %u\n", pos); + assert(pos < 64); + return buf + pos; + } else { + return NULL; // no match + } +} + +static really_inline +const u8 *fwdBlock512(m512 mask_lo, m512 mask_hi, m512 chars, const u8 *buf, + const m512 low4bits, const m512 zeroes) { + u64a z = block(mask_lo, mask_hi, chars, low4bits, zeroes); + + return firstMatch64(buf, z); +} + +static really_inline +const u8 *shortShufti512(m512 mask_lo, m512 mask_hi, const u8 *buf, + const u8 *buf_end, const m512 low4bits, + const m512 zeroes) { + DEBUG_PRINTF("short shufti %p len %zu\n", buf, buf_end - buf); + uintptr_t len = buf_end - buf; + assert(len <= 64); + + // load mask + u64a k = (~0ULL) >> (64 - len); + DEBUG_PRINTF("load mask 0x%016llx\n", k); + + m512 chars = loadu_maskz_m512(k, buf); + + u64a z = block(mask_lo, mask_hi, chars, low4bits, zeroes); + + // reuse the load mask to indicate valid bytes + return firstMatch64(buf, z | ~k); +} + +/* takes 128 bit masks, but operates on 512 bits of data */ +const u8 *shuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, + const u8 *buf_end) { + assert(buf && buf_end); + assert(buf < buf_end); + DEBUG_PRINTF("shufti %p len %zu\n", buf, buf_end - buf); + DEBUG_PRINTF("b %s\n", buf); + + const m512 low4bits = set64x8(0xf); + const m512 zeroes = zeroes512(); + const m512 
wide_mask_lo = set4x128(mask_lo); + const m512 wide_mask_hi = set4x128(mask_hi); + const u8 *rv; + + // small cases. + if (buf_end - buf <= 64) { + rv = shortShufti512(wide_mask_lo, wide_mask_hi, buf, buf_end, low4bits, + zeroes); + return rv ? rv : buf_end; + } + + assert(buf_end - buf >= 64); + + // Preconditioning: most of the time our buffer won't be aligned. + if ((uintptr_t)buf % 64) { + rv = shortShufti512(wide_mask_lo, wide_mask_hi, buf, + ROUNDUP_PTR(buf, 64), low4bits, zeroes); + if (rv) { + return rv; + } + buf = ROUNDUP_PTR(buf, 64); + } + + const u8 *last_block = ROUNDDOWN_PTR(buf_end, 64); + while (buf < last_block) { + m512 lchars = load512(buf); + rv = fwdBlock512(wide_mask_lo, wide_mask_hi, lchars, buf, low4bits, + zeroes); + if (rv) { + return rv; + } + buf += 64; + } + + if (buf == buf_end) { + goto done; + } + + // Use an unaligned load to mop up the last 64 bytes and get an accurate + // picture to buf_end. + assert(buf <= buf_end && buf >= buf_end - 64); + m512 chars = loadu512(buf_end - 64); + rv = fwdBlock512(wide_mask_lo, wide_mask_hi, chars, buf_end - 64, low4bits, + zeroes); + if (rv) { + return rv; + } +done: + return buf_end; +} + +static really_inline +const u8 *lastMatch64(const u8 *buf, u64a z) { + DEBUG_PRINTF("z 0x%016llx\n", z); + if (unlikely(z != ~0ULL)) { + u32 pos = clz64(~z); + DEBUG_PRINTF("buf=%p, pos=%u\n", buf, pos); + return buf + (63 - pos); + } else { + return NULL; // no match + } +} + +static really_inline +const u8 *rshortShufti512(m512 mask_lo, m512 mask_hi, const u8 *buf, + const u8 *buf_end, const m512 low4bits, + const m512 zeroes) { + DEBUG_PRINTF("short %p len %zu\n", buf, buf_end - buf); + uintptr_t len = buf_end - buf; + assert(len <= 64); + + // load mask + u64a k = (~0ULL) >> (64 - len); + DEBUG_PRINTF("load mask 0x%016llx\n", k); + + m512 chars = loadu_maskz_m512(k, buf); + + u64a z = block(mask_lo, mask_hi, chars, low4bits, zeroes); + + // reuse the load mask to indicate valid bytes + return lastMatch64(buf, z | ~k); +} + +static really_inline +const u8 *revBlock512(m512 mask_lo, m512 mask_hi, m512 chars, const u8 *buf, + const m512 low4bits, const m512 zeroes) { + m512 c_lo = pshufb_m512(mask_lo, and512(chars, low4bits)); + m512 c_hi = pshufb_m512(mask_hi, + rshift64_m512(andnot512(low4bits, chars), 4)); + m512 t = and512(c_lo, c_hi); + +#ifdef DEBUG + DEBUG_PRINTF(" chars: "); dumpMsk512AsChars(chars); printf("\n"); + DEBUG_PRINTF(" char: "); dumpMsk512(chars); printf("\n"); + DEBUG_PRINTF(" c_lo: "); dumpMsk512(c_lo); printf("\n"); + DEBUG_PRINTF(" c_hi: "); dumpMsk512(c_hi); printf("\n"); + DEBUG_PRINTF(" t: "); dumpMsk512(t); printf("\n"); +#endif + + u64a z = eq512mask(t, zeroes); + return lastMatch64(buf, z); +} + +/* takes 128 bit masks, but operates on 512 bits of data */ +const u8 *rshuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, + const u8 *buf_end) { + DEBUG_PRINTF("buf %p buf_end %p\n", buf, buf_end); + assert(buf && buf_end); + assert(buf < buf_end); + + const m512 low4bits = set64x8(0xf); + const m512 zeroes = zeroes512(); + const m512 wide_mask_lo = set4x128(mask_lo); + const m512 wide_mask_hi = set4x128(mask_hi); + const u8 *rv; + + if (buf_end - buf < 64) { + rv = rshortShufti512(wide_mask_lo, wide_mask_hi, buf, buf_end, low4bits, + zeroes); + return rv ? 
rv : buf - 1; + } + + if (ROUNDDOWN_PTR(buf_end, 64) != buf_end) { + // peel off unaligned portion + assert(buf_end - buf >= 64); + DEBUG_PRINTF("start\n"); + rv = rshortShufti512(wide_mask_lo, wide_mask_hi, + ROUNDDOWN_PTR(buf_end, 64), buf_end, low4bits, + zeroes); + if (rv) { + return rv; + } + buf_end = ROUNDDOWN_PTR(buf_end, 64); + } + + const u8 *last_block = ROUNDUP_PTR(buf, 64); + while (buf_end > last_block) { + buf_end -= 64; + m512 lchars = load512(buf_end); + rv = revBlock512(wide_mask_lo, wide_mask_hi, lchars, buf_end, low4bits, + zeroes); + if (rv) { + return rv; + } + } + if (buf_end == buf) { + goto done; + } + // Use an unaligned load to mop up the last 64 bytes and get an accurate + // picture to buf. + m512 chars = loadu512(buf); + rv = revBlock512(wide_mask_lo, wide_mask_hi, chars, buf, low4bits, zeroes); + if (rv) { + return rv; + } +done: + return buf - 1; +} + +static really_inline +const u8 *fwdBlock2(m512 mask1_lo, m512 mask1_hi, m512 mask2_lo, m512 mask2_hi, + m512 chars, const u8 *buf, const m512 low4bits, + const m512 ones, __mmask64 k) { + DEBUG_PRINTF("buf %p %.64s\n", buf, buf); + m512 chars_lo = and512(chars, low4bits); + m512 chars_hi = rshift64_m512(andnot512(low4bits, chars), 4); + m512 c_lo = maskz_pshufb_m512(k, mask1_lo, chars_lo); + m512 c_hi = maskz_pshufb_m512(k, mask1_hi, chars_hi); + m512 t = or512(c_lo, c_hi); + +#ifdef DEBUG + DEBUG_PRINTF(" chars: "); dumpMsk512AsChars(chars); printf("\n"); + DEBUG_PRINTF(" char: "); dumpMsk512(chars); printf("\n"); + DEBUG_PRINTF(" c_lo: "); dumpMsk512(c_lo); printf("\n"); + DEBUG_PRINTF(" c_hi: "); dumpMsk512(c_hi); printf("\n"); + DEBUG_PRINTF(" t: "); dumpMsk512(t); printf("\n"); +#endif + + m512 c2_lo = maskz_pshufb_m512(k, mask2_lo, chars_lo); + m512 c2_hi = maskz_pshufb_m512(k, mask2_hi, chars_hi); + m512 t2 = or512(t, rshift128_m512(or512(c2_lo, c2_hi), 1)); + +#ifdef DEBUG + DEBUG_PRINTF(" c2_lo: "); dumpMsk512(c2_lo); printf("\n"); + DEBUG_PRINTF(" c2_hi: "); dumpMsk512(c2_hi); printf("\n"); + DEBUG_PRINTF(" t2: "); dumpMsk512(t2); printf("\n"); +#endif + u64a z = eq512mask(t2, ones); + + return firstMatch64(buf, z | ~k); +} + +static really_inline +const u8 *shortDoubleShufti512(m512 mask1_lo, m512 mask1_hi, m512 mask2_lo, + m512 mask2_hi, const u8 *buf, const u8 *buf_end, + const m512 low4bits, const m512 ones) { + DEBUG_PRINTF("short %p len %zu\n", buf, buf_end - buf); + uintptr_t len = buf_end - buf; + assert(len <= 64); + + u64a k = (~0ULL) >> (64 - len); + DEBUG_PRINTF("load mask 0x%016llx\n", k); + + m512 chars = loadu_mask_m512(ones, k, buf); + + const u8 *rv = fwdBlock2(mask1_lo, mask1_hi, mask2_lo, mask2_hi, chars, buf, + low4bits, ones, k); + + return rv; +} + +/* takes 128 bit masks, but operates on 512 bits of data */ +const u8 *shuftiDoubleExec(m128 mask1_lo, m128 mask1_hi, + m128 mask2_lo, m128 mask2_hi, + const u8 *buf, const u8 *buf_end) { + /* we should always have at least 16 bytes */ + assert(buf_end - buf >= 16); + DEBUG_PRINTF("buf %p len %zu\n", buf, buf_end - buf); + + const m512 ones = ones512(); + const m512 low4bits = set64x8(0xf); + const m512 wide_mask1_lo = set4x128(mask1_lo); + const m512 wide_mask1_hi = set4x128(mask1_hi); + const m512 wide_mask2_lo = set4x128(mask2_lo); + const m512 wide_mask2_hi = set4x128(mask2_hi); + const u8 *rv; + + if (buf_end - buf <= 64) { + rv = shortDoubleShufti512(wide_mask1_lo, wide_mask1_hi, wide_mask2_lo, + wide_mask2_hi, buf, buf_end, low4bits, ones); + DEBUG_PRINTF("rv %p\n", rv); + return rv ? 
rv : buf_end; + } + + // Preconditioning: most of the time our buffer won't be aligned. + if ((uintptr_t)buf % 64) { + rv = shortDoubleShufti512(wide_mask1_lo, wide_mask1_hi, wide_mask2_lo, + wide_mask2_hi, buf, ROUNDUP_PTR(buf, 64), + low4bits, ones); + if (rv) { + return rv; + } + + buf = ROUNDUP_PTR(buf, 64); + } + + const u8 *last_block = buf_end - 64; + while (buf < last_block) { + m512 lchars = load512(buf); + rv = fwdBlock2(wide_mask1_lo, wide_mask1_hi, wide_mask2_lo, + wide_mask2_hi, lchars, buf, low4bits, ones, ~0); + if (rv) { + return rv; + } + buf += 64; + } + + // Use an unaligned load to mop up the last 64 bytes and get an accurate + // picture to buf_end. + m512 chars = loadu512(buf_end - 64); + rv = fwdBlock2(wide_mask1_lo, wide_mask1_hi, wide_mask2_lo, wide_mask2_hi, + chars, buf_end - 64, low4bits, ones, ~0); + if (rv) { + return rv; + } + + return buf_end; +} +#endif diff --git a/regex/nfa/shufti.h b/regex/nfa/shufti.h new file mode 100644 index 000000000..1ebf776cc --- /dev/null +++ b/regex/nfa/shufti.h @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2015, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Shufti: character class acceleration. + * + * Utilises the SSSE3 pshufb shuffle instruction + */ + +#ifndef SHUFTI_H +#define SHUFTI_H + +#include "ue2common.h" +#include "util/simd_utils.h" + +#ifdef __cplusplus +extern "C" +{ +#endif + +const u8 *shuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, + const u8 *buf_end); + +// Returns (buf - 1) if not found. 
+const u8 *rshuftiExec(m128 mask_lo, m128 mask_hi, const u8 *buf, + const u8 *buf_end); + +const u8 *shuftiDoubleExec(m128 mask1_lo, m128 mask1_hi, + m128 mask2_lo, m128 mask2_hi, + const u8 *buf, const u8 *buf_end); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/regex/nfa/tamarama.c b/regex/nfa/tamarama.c new file mode 100644 index 000000000..43480f065 --- /dev/null +++ b/regex/nfa/tamarama.c @@ -0,0 +1,441 @@ +/* + * Copyright (c) 2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + \brief Tamarama: container engine for exclusive engines, runtime code. 
+*/ +#include "config.h" + +#include "tamarama.h" + +#include "tamarama_internal.h" +#include "nfa_api.h" +#include "nfa_api_queue.h" +#include "nfa_api_util.h" +#include "nfa_internal.h" +#include "scratch.h" +#include "util/partial_store.h" + +static really_inline +u32 getSubOffset(const struct Tamarama *t, u32 num) { + DEBUG_PRINTF("subengine:%u\n", num); + assert(num < t->numSubEngines); + const u32 *sub = + (const u32 *)((const char *)t + sizeof(struct Tamarama) + + t->numSubEngines * sizeof(u32)); + assert(ISALIGNED(sub)); + return sub[num]; +} + +static +const struct NFA *getSubEngine(const struct Tamarama *t, + const u32 activeIdx) { + const u32 offset = getSubOffset(t, activeIdx); + DEBUG_PRINTF("activeIdx:%u offsets:%u\n", activeIdx, offset); + const char *base = (const char *)t; + return (const struct NFA *)(base + offset); +} + +static +void storeActiveIdx(const struct Tamarama *t, char *state, + const u32 idx) { + assert(idx <= t->numSubEngines); + partial_store_u32(state, idx, t->activeIdxSize); +} + +static +u32 loadActiveIdx(const char *state, + const u32 activeIdxSize) { + return partial_load_u32(state, activeIdxSize); +} + +static really_inline +void copyQueueProperties(const struct mq *q1, struct mq *q2, + const u32 activeIdxSize) { + q2->state = q1->state; + q2->streamState = q1->streamState + activeIdxSize; + q2->offset = q1->offset; + q2->buffer = q1->buffer; + q2->length = q1->length; + q2->history = q1->history; + q2->hlength = q1->hlength; + q2->cb = q1->cb; + q2->context = q1->context; + q2->scratch = q1->scratch; + q2->report_current = q1->report_current; +} + +static +void copyQueueItems(const struct Tamarama *t, const struct NFA *sub, + struct mq *q1, struct mq *q2, const u32 activeIdx) { + const u32 *baseTop = (const u32 *)((const char *)t + + sizeof(struct Tamarama)); + + u32 lower = baseTop[activeIdx]; + u32 upper = activeIdx == t->numSubEngines - 1 ? + ~0U : baseTop[activeIdx + 1]; + u32 event_base = isMultiTopType(sub->type) ? 
MQE_TOP_FIRST : MQE_TOP; + while (q1->cur < q1->end) { + u32 type = q1->items[q1->cur].type; + s64a loc = q1->items[q1->cur].location; + DEBUG_PRINTF("type:%u lower:%u upper:%u\n", type, lower, upper); + if (type >= lower && type < upper) { + u32 event = event_base; + if (event == MQE_TOP_FIRST) { + event += type - lower; + } + pushQueue(q2, event, loc); + } else { + pushQueueNoMerge(q2, MQE_END, loc); + break; + } + q1->cur++; + } +} + +static +void copyQueue(const struct Tamarama *t, const struct NFA *sub, + struct mq *q1, struct mq *q2, const u32 activeIdx) { + copyQueueProperties(q1, q2, t->activeIdxSize); + + // copy MQE_START item + u32 cur = q1->cur++; + q2->cur = cur; + q2->items[cur] = q1->items[cur]; + q2->end = cur + 1; + + copyQueueItems(t, sub, q1, q2, activeIdx); + // restore cur index of the main queue + q1->cur = cur; +} + +static +u32 findEngineForTop(const u32 *baseTop, const u32 cur, + const u32 numSubEngines) { + u32 i; + for (i = 0; i < numSubEngines; ++i) { + DEBUG_PRINTF("cur:%u base:%u\n", cur, baseTop[i]); + if (cur >= baseTop[i] && + (i == numSubEngines - 1 || cur < baseTop[i + 1])) { + break; + } + } + return i; +} + +static +void initSubQueue(const struct Tamarama *t, struct mq *q1, + struct mq *q2, const u32 lastActiveIdx, + const u32 activeIdx) { + // Push events to the new queue + const struct NFA *sub = getSubEngine(t, activeIdx); + assert(!isContainerType(sub->type)); + q2->nfa = sub; + + // Reinitialize state if the last active subengine is different + // from current one + if (lastActiveIdx == t->numSubEngines || + lastActiveIdx != activeIdx) { + nfaQueueInitState(q2->nfa, q2); + } + + copyQueueItems(t, sub, q1, q2, activeIdx); + if (q1->items[q1->cur].type == MQE_END) { + q1->cur++; + } + DEBUG_PRINTF("update lastIdx:%u\n", activeIdx); + storeActiveIdx(t, q1->streamState, activeIdx); +} + +static +void updateQueues(const struct Tamarama *t, struct mq *q1, struct mq *q2) { + q2->cur = q2->end = 0; + copyQueueProperties(q1, q2, t->activeIdxSize); + + const u32 numSubEngines = t->numSubEngines; + u32 lastActiveIdx = loadActiveIdx(q1->streamState, + t->activeIdxSize); +#ifdef DEBUG + DEBUG_PRINTF("external queue\n"); + debugQueue(q1); +#endif + + // Push MQE_START event to the subqueue + s64a loc = q1->items[q1->cur].location; + pushQueueAt(q2, 0, MQE_START, loc); + char hasStart = 0; + if (q1->items[q1->cur].type == MQE_START) { + hasStart = 1; + q1->cur++; + } + + u32 activeIdx = lastActiveIdx; + // If we have top events in the main queue, update current active id + if (q1->cur < q1->end - 1) { + const u32 *baseTop = (const u32 *)((const char *)t + + sizeof(struct Tamarama)); + u32 curTop = q1->items[q1->cur].type; + activeIdx = findEngineForTop(baseTop, curTop, numSubEngines); + } + + assert(activeIdx < numSubEngines); + DEBUG_PRINTF("last id:%u, current id:%u, num of subengines:%u\n", + lastActiveIdx, activeIdx, numSubEngines); + // Handle unfinished last alive subengine + if (lastActiveIdx != activeIdx && + lastActiveIdx != numSubEngines && hasStart) { + loc = q1->items[q1->cur].location; + pushQueueNoMerge(q2, MQE_END, loc); + q2->nfa = getSubEngine(t, lastActiveIdx); + return; + } + + initSubQueue(t, q1, q2, lastActiveIdx, activeIdx); + DEBUG_PRINTF("finish queues\n"); +} + +// After processing subqueue items for subengines, we need to copy back +// remaining items in subqueue if there are any to Tamarama main queue +static +void copyBack(const struct Tamarama *t, struct mq *q, struct mq *q1) { + DEBUG_PRINTF("copy back %u, %u\n", q1->cur, q1->end); + 
q->report_current = q1->report_current; + if (q->cur >= q->end && q1->cur >= q1->end) { + return; + } + + const u32 *baseTop = (const u32 *)((const char *)t + + sizeof(struct Tamarama)); + const u32 lastIdx = loadActiveIdx(q->streamState, + t->activeIdxSize); + u32 base = 0, event_base = 0; + if (lastIdx != t->numSubEngines) { + base = baseTop[lastIdx]; + const struct NFA *sub = getSubEngine(t, lastIdx); + event_base = isMultiTopType(sub->type) ? MQE_TOP_FIRST : MQE_TOP; + } + + u32 numItems = q1->end > q1->cur + 1 ? q1->end - q1->cur - 1 : 1; + // Also need to copy MQE_END if the main queue is empty + if (q->cur == q->end) { + assert(q->cur > 1 && q1->items[q1->end - 1].type == MQE_END); + q->items[--q->cur] = q1->items[q1->end - 1]; + } + u32 cur = q->cur - numItems; + q->items[cur] = q1->items[q1->cur++]; + q->items[cur].type = MQE_START; + q->cur = cur++; + for (u32 i = 0; i < numItems - 1; ++i) { + assert(q1->cur < q1->end); + u32 type = q1->items[q1->cur].type; + if (type > MQE_END) { + q1->items[q1->cur].type = type - event_base + base; + } + q->items[cur++] = q1->items[q1->cur++]; + } + +#ifdef DEBUG + DEBUG_PRINTF("external queue\n"); + debugQueue(q); +#endif +} + +char nfaExecTamarama_testEOD(const struct NFA *n, const char *state, + const char *streamState, u64a offset, + NfaCallback callback, void *context) { + const struct Tamarama *t = getImplNfa(n); + u32 activeIdx = loadActiveIdx(streamState, t->activeIdxSize); + if (activeIdx == t->numSubEngines) { + return MO_CONTINUE_MATCHING; + } + + const struct NFA *sub = getSubEngine(t, activeIdx); + if (nfaAcceptsEod(sub)) { + assert(!isContainerType(sub->type)); + const char *subStreamState = streamState + t->activeIdxSize; + return nfaCheckFinalState(sub, state, subStreamState, offset, callback, + context); + } + + return MO_CONTINUE_MATCHING; +} + +char nfaExecTamarama_QR(const struct NFA *n, struct mq *q, ReportID report) { + DEBUG_PRINTF("exec rose\n"); + struct mq q1; + q1.cur = q1.end = 0; + char rv = 0; + const struct Tamarama *t = getImplNfa(n); + while (q->cur < q->end) { + updateQueues(t, q, &q1); + } + + if (q1.cur < q1.end) { + rv = nfaQueueExecRose(q1.nfa, &q1, report); + } + + DEBUG_PRINTF("exec rose rv:%u\n", rv); + return rv; +} + +char nfaExecTamarama_reportCurrent(const struct NFA *n, struct mq *q) { + const struct Tamarama *t = getImplNfa(n); + u32 activeIdx = loadActiveIdx(q->streamState, t->activeIdxSize); + if (activeIdx == t->numSubEngines) { + return 1; + } + + const struct NFA *sub = getSubEngine(t, activeIdx); + struct mq q1; + copyQueue(t, sub, q, &q1, activeIdx); + return nfaReportCurrentMatches(sub, &q1); +} + +char nfaExecTamarama_inAccept(const struct NFA *n, ReportID report, + struct mq *q) { + const struct Tamarama *t = getImplNfa(n); + u32 activeIdx = loadActiveIdx(q->streamState, t->activeIdxSize); + if (activeIdx == t->numSubEngines) { + return 0; + } + const struct NFA *sub = getSubEngine(t, activeIdx); + + struct mq q1; + copyQueue(t, sub, q, &q1, activeIdx); + return nfaInAcceptState(sub, report, &q1); +} + +char nfaExecTamarama_inAnyAccept(const struct NFA *n, struct mq *q) { + const struct Tamarama *t = getImplNfa(n); + u32 activeIdx = loadActiveIdx(q->streamState, t->activeIdxSize); + if (activeIdx == t->numSubEngines) { + return 0; + } + const struct NFA *sub = getSubEngine(t, activeIdx); + + struct mq q1; + copyQueue(t, sub, q, &q1, activeIdx); + return nfaInAnyAcceptState(sub, &q1); +} + +char nfaExecTamarama_queueInitState(const struct NFA *n, struct mq *q) { + DEBUG_PRINTF("init 
state\n"); + const struct Tamarama *t = getImplNfa(n); + char *ptr = q->streamState; + // Use activeIdxSize as a sentinel value and initialize the state to + // an invalid engine as nothing has been triggered yet + storeActiveIdx(t, ptr, t->numSubEngines); + return 0; +} + +char nfaExecTamarama_queueCompressState(const struct NFA *n, const struct mq *q, + s64a loc) { + const struct Tamarama *t = getImplNfa(n); + u32 activeIdx = loadActiveIdx(q->streamState, t->activeIdxSize); + if (activeIdx == t->numSubEngines) { + return 0; + } + + const struct NFA *sub = getSubEngine(t, activeIdx); + + struct mq q1; + copyQueueProperties(q, &q1, t->activeIdxSize); + return nfaQueueCompressState(sub, &q1, loc); +} + +char nfaExecTamarama_expandState(const struct NFA *n, void *dest, + const void *src, u64a offset, u8 key) { + const struct Tamarama *t = getImplNfa(n); + u32 activeIdx = loadActiveIdx(src, t->activeIdxSize); + if (activeIdx == t->numSubEngines) { + return 0; + } + + const struct NFA *sub = getSubEngine(t, activeIdx); + + const char *subStreamState = (const char *)src + t->activeIdxSize; + return nfaExpandState(sub, dest, subStreamState, offset, key); +} + +enum nfa_zombie_status nfaExecTamarama_zombie_status(const struct NFA *n, + struct mq *q, s64a loc) { + const struct Tamarama *t = getImplNfa(n); + u32 activeIdx = loadActiveIdx(q->streamState, t->activeIdxSize); + if (activeIdx == t->numSubEngines) { + return NFA_ZOMBIE_NO; + } + const struct NFA *sub = getSubEngine(t, activeIdx); + + struct mq q1; + copyQueue(t, sub, q, &q1, activeIdx); + return nfaGetZombieStatus(sub, &q1, loc); +} + +char nfaExecTamarama_Q(const struct NFA *n, struct mq *q, s64a end) { + DEBUG_PRINTF("exec\n"); + struct mq q1; + char rv = MO_ALIVE; + char copy = 0; + const struct Tamarama *t = getImplNfa(n); + while (q->cur < q->end && q_cur_loc(q) <= end) { + updateQueues(t, q, &q1); + rv = nfaQueueExec_raw(q1.nfa, &q1, end); + q->report_current = q1.report_current; + copy = 1; + if (can_stop_matching(q->scratch)) { + break; + } + } + if (copy) { + copyBack(t, q, &q1); + } + return rv; +} + +char nfaExecTamarama_Q2(const struct NFA *n, struct mq *q, s64a end) { + DEBUG_PRINTF("exec to match\n"); + struct mq q1; + char rv = 0; + char copy = 0; + const struct Tamarama *t = getImplNfa(n); + while (q->cur < q->end && q_cur_loc(q) <= end && + rv != MO_MATCHES_PENDING) { + updateQueues(t, q, &q1); + rv = nfaQueueExec2_raw(q1.nfa, &q1, end); + q->report_current = q1.report_current; + copy = 1; + if (can_stop_matching(q->scratch)) { + break; + } + } + if (copy) { + copyBack(t, q, &q1); + } + return rv; +} + diff --git a/regex/nfa/tamarama.h b/regex/nfa/tamarama.h new file mode 100644 index 000000000..3b52d8de7 --- /dev/null +++ b/regex/nfa/tamarama.h @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef TAMARAMA_H +#define TAMARAMA_H + +#ifdef __cplusplus +extern "C" +{ +#endif + +#include "callback.h" +#include "ue2common.h" + +struct mq; +struct NFA; +struct hs_scratch; + +char nfaExecTamarama_testEOD(const struct NFA *n, const char *state, + const char *streamState, u64a offset, + NfaCallback callback, void *context); +char nfaExecTamarama_QR(const struct NFA *n, struct mq *q, ReportID report); +char nfaExecTamarama_reportCurrent(const struct NFA *n, struct mq *q); +char nfaExecTamarama_inAccept(const struct NFA *n, ReportID report, + struct mq *q); +char nfaExecTamarama_inAnyAccept(const struct NFA *n, struct mq *q); +char nfaExecTamarama_queueInitState(const struct NFA *n, struct mq *q); +char nfaExecTamarama_queueCompressState(const struct NFA *n, const struct mq *q, + s64a loc); +char nfaExecTamarama_expandState(const struct NFA *n, void *dest, + const void *src, u64a offset, u8 key); +enum nfa_zombie_status nfaExecTamarama_zombie_status(const struct NFA *n, + struct mq *q, s64a loc); +char nfaExecTamarama_Q(const struct NFA *nfa, struct mq *q, s64a end); +char nfaExecTamarama_Q2(const struct NFA *nfa, struct mq *q, s64a end); + +// only used by outfix and miracles, no implementation for tamarama +#define nfaExecTamarama_initCompressedState NFA_API_NO_IMPL +#define nfaExecTamarama_B_Reverse NFA_API_NO_IMPL + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/regex/nfa/tamarama_internal.h b/regex/nfa/tamarama_internal.h new file mode 100644 index 000000000..5cdc70d40 --- /dev/null +++ b/regex/nfa/tamarama_internal.h @@ -0,0 +1,105 @@ +/* + * Copyright (c) 2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + *\brief Tamarama: container engine for exclusive engines, + * data structures. + */ + +/* Tamarama bytecode layout: + * * |-----| + * * | | struct NFA + * * |-----| + * * | | struct Tamarama + * * | | + * * |-----| + * * | | top remapping table: + * * | | stores top base for each subengine. + * * | | old_top = remapped_top - top_base; + * * | | The size of table is equal to the number of subengines. + * * ... + * * | | + * * |-----| + * * | | offsets from the start of struct Tamarama to subengines --\ + * * ... | + * * | | -----------\ | + * * |-----| | | + * * ||--| | subengine 1 (struct NFA + rest of subengine) <--/ | + * * || | | | + * * ||--| | | + * * || | | | + * * || | | | + * * ||--| | | + * * | | | + * * ||--| | subengine 2 (struct NFA + rest of subengine) <-------/ + * * || | | + * * ||--| | + * * || | | + * * || | | + * * ||--| | + * * | | + * * ... + * * | | + * * |-----| total size of tamarama + * * + * * Tamarama stream state: + * * + * * |---| + * * | | active subengine id + * * |---| + * * | | common pool of stream state for each engine + * * | | + * * | | + * * ... + * * | | + * * | | + * * |---| + * * + * * Tamarama scratch space: + * * + * * |---| + * * | | common pool of scratch for each engine + * * | | + * * | | + * * ... + * * | | + * * | | + * * |---| + * */ + +#ifndef NFA_TAMARAMA_INTERNAL_H +#define NFA_TAMARAMA_INTERNAL_H + +#include "ue2common.h" + +struct ALIGN_AVX_DIRECTIVE Tamarama { + u32 numSubEngines; + u8 activeIdxSize; +}; + +#endif // NFA_TAMARAMA_INTERNAL_H diff --git a/regex/nfa/truffle.c b/regex/nfa/truffle.c new file mode 100644 index 000000000..be6b312cf --- /dev/null +++ b/regex/nfa/truffle.c @@ -0,0 +1,608 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * Matches a byte in a charclass using three shuffles + */ + + +#include "ue2common.h" +#include "truffle.h" +#include "util/arch.h" +#include "util/bitutils.h" +#include "util/simd_utils.h" + +#if !defined(HAVE_AVX2) + +static really_inline +const u8 *lastMatch(const u8 *buf, u32 z) { + if (unlikely(z != 0xffff)) { + u32 pos = clz32(~z & 0xffff); + assert(pos >= 16 && pos < 32); + return buf + (31 - pos); + } + + return NULL; // no match +} + +static really_inline +const u8 *firstMatch(const u8 *buf, u32 z) { + if (unlikely(z != 0xffff)) { + u32 pos = ctz32(~z & 0xffff); + assert(pos < 16); + return buf + pos; + } + + return NULL; // no match +} + +static really_inline +u32 block(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highset, m128 v) { + + m128 highconst = _mm_set1_epi8(0x80); + m128 shuf_mask_hi = _mm_set1_epi64x(0x8040201008040201); + + // and now do the real work + m128 shuf1 = pshufb_m128(shuf_mask_lo_highclear, v); + m128 t1 = xor128(v, highconst); + m128 shuf2 = pshufb_m128(shuf_mask_lo_highset, t1); + m128 t2 = andnot128(highconst, rshift64_m128(v, 4)); + m128 shuf3 = pshufb_m128(shuf_mask_hi, t2); + m128 tmp = and128(or128(shuf1, shuf2), shuf3); + m128 tmp2 = eq128(tmp, zeroes128()); + u32 z = movemask128(tmp2); + + return z; +} + +static +const u8 *truffleMini(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highset, + const u8 *buf, const u8 *buf_end) { + uintptr_t len = buf_end - buf; + assert(len < 16); + + m128 chars = zeroes128(); + memcpy(&chars, buf, len); + + u32 z = block(shuf_mask_lo_highclear, shuf_mask_lo_highset, chars); + // can't be these bytes in z + u32 mask = (0xffff >> (16 - len)) ^ 0xffff; + const u8 *rv = firstMatch(buf, z | mask); + + if (rv) { + return rv; + } else { + return buf_end; + } +} + +static really_inline +const u8 *fwdBlock(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highset, + m128 v, const u8 *buf) { + u32 z = block(shuf_mask_lo_highclear, shuf_mask_lo_highset, v); + return firstMatch(buf, z); +} + +static really_inline +const u8 *revBlock(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highset, + m128 v, const u8 *buf) { + u32 z = block(shuf_mask_lo_highclear, shuf_mask_lo_highset, v); + return lastMatch(buf, z); +} + +const u8 *truffleExec(m128 shuf_mask_lo_highclear, + m128 shuf_mask_lo_highset, + const u8 *buf, const u8 *buf_end) { + DEBUG_PRINTF("len %zu\n", buf_end - buf); + + assert(buf && buf_end); + assert(buf < buf_end); + const u8 *rv; + + if (buf_end - buf < 16) { + return truffleMini(shuf_mask_lo_highclear, shuf_mask_lo_highset, buf, + buf_end); + } + + size_t min = (size_t)buf % 16; + assert(buf_end - buf >= 16); + + // Preconditioning: most of the time our buffer won't be aligned. 
+ m128 chars = loadu128(buf); + rv = fwdBlock(shuf_mask_lo_highclear, shuf_mask_lo_highset, chars, buf); + if (rv) { + return rv; + } + buf += (16 - min); + + const u8 *last_block = buf_end - 16; + while (buf < last_block) { + m128 lchars = load128(buf); + rv = fwdBlock(shuf_mask_lo_highclear, shuf_mask_lo_highset, lchars, + buf); + if (rv) { + return rv; + } + buf += 16; + } + + // Use an unaligned load to mop up the last 16 bytes and get an accurate + // picture to buf_end. + assert(buf <= buf_end && buf >= buf_end - 16); + chars = loadu128(buf_end - 16); + rv = fwdBlock(shuf_mask_lo_highclear, shuf_mask_lo_highset, chars, + buf_end - 16); + if (rv) { + return rv; + } + + return buf_end; +} + +static +const u8 *truffleRevMini(m128 shuf_mask_lo_highclear, + m128 shuf_mask_lo_highset, const u8 *buf, + const u8 *buf_end) { + uintptr_t len = buf_end - buf; + assert(len < 16); + + m128 chars = zeroes128(); + memcpy(&chars, buf, len); + + u32 mask = (0xffff >> (16 - len)) ^ 0xffff; + u32 z = block(shuf_mask_lo_highclear, shuf_mask_lo_highset, chars); + const u8 *rv = lastMatch(buf, z | mask); + + if (rv) { + return rv; + } + return buf - 1; +} + +const u8 *rtruffleExec(m128 shuf_mask_lo_highclear, + m128 shuf_mask_lo_highset, + const u8 *buf, const u8 *buf_end) { + assert(buf && buf_end); + assert(buf < buf_end); + const u8 *rv; + + DEBUG_PRINTF("len %zu\n", buf_end - buf); + + if (buf_end - buf < 16) { + return truffleRevMini(shuf_mask_lo_highclear, shuf_mask_lo_highset, buf, + buf_end); + } + + assert(buf_end - buf >= 16); + + // Preconditioning: most of the time our buffer won't be aligned. + m128 chars = loadu128(buf_end - 16); + rv = revBlock(shuf_mask_lo_highclear, shuf_mask_lo_highset, chars, + buf_end - 16); + if (rv) { + return rv; + } + buf_end = (const u8 *)((size_t)buf_end & ~((size_t)0xf)); + + const u8 *last_block = buf + 16; + while (buf_end > last_block) { + buf_end -= 16; + m128 lchars = load128(buf_end); + rv = revBlock(shuf_mask_lo_highclear, shuf_mask_lo_highset, lchars, + buf_end); + if (rv) { + return rv; + } + } + + // Use an unaligned load to mop up the last 16 bytes and get an accurate + // picture to buf_end. 
+ chars = loadu128(buf); + rv = revBlock(shuf_mask_lo_highclear, shuf_mask_lo_highset, chars, buf); + if (rv) { + return rv; + } + + return buf - 1; +} + +#elif !defined(HAVE_AVX512) + +// AVX2 + +static really_inline +const u8 *lastMatch(const u8 *buf, u32 z) { + if (unlikely(z != 0xffffffff)) { + u32 pos = clz32(~z); + assert(pos < 32); + return buf + (31 - pos); + } + + return NULL; // no match +} + +static really_inline +const u8 *firstMatch(const u8 *buf, u32 z) { + if (unlikely(z != 0xffffffff)) { + u32 pos = ctz32(~z); + assert(pos < 32); + return buf + pos; + } + + return NULL; // no match +} + +static really_inline +u32 block(m256 shuf_mask_lo_highclear, m256 shuf_mask_lo_highset, m256 v) { + + m256 highconst = _mm256_set1_epi8(0x80); + m256 shuf_mask_hi = _mm256_set1_epi64x(0x8040201008040201); + + // and now do the real work + m256 shuf1 = pshufb_m256(shuf_mask_lo_highclear, v); + m256 t1 = xor256(v, highconst); + m256 shuf2 = pshufb_m256(shuf_mask_lo_highset, t1); + m256 t2 = andnot256(highconst, rshift64_m256(v, 4)); + m256 shuf3 = pshufb_m256(shuf_mask_hi, t2); + m256 tmp = and256(or256(shuf1, shuf2), shuf3); + m256 tmp2 = eq256(tmp, zeroes256()); + u32 z = movemask256(tmp2); + + return z; +} + +static +const u8 *truffleMini(m256 shuf_mask_lo_highclear, m256 shuf_mask_lo_highset, + const u8 *buf, const u8 *buf_end) { + uintptr_t len = buf_end - buf; + assert(len < 32); + + m256 chars = zeroes256(); + memcpy(&chars, buf, len); + + u32 z = block(shuf_mask_lo_highclear, shuf_mask_lo_highset, chars); + // can't be these bytes in z + u32 mask = (0xffffffff >> (32 - len)) ^ 0xffffffff; + const u8 *rv = firstMatch(buf, z | mask); + + if (rv) { + return rv; + } else { + return buf_end; + } +} + +static really_inline +const u8 *fwdBlock(m256 shuf_mask_lo_highclear, m256 shuf_mask_lo_highset, + m256 v, const u8 *buf) { + u32 z = block(shuf_mask_lo_highclear, shuf_mask_lo_highset, v); + return firstMatch(buf, z); +} + +static really_inline +const u8 *revBlock(m256 shuf_mask_lo_highclear, m256 shuf_mask_lo_highset, + m256 v, const u8 *buf) { + u32 z = block(shuf_mask_lo_highclear, shuf_mask_lo_highset, v); + return lastMatch(buf, z); +} + +const u8 *truffleExec(m128 shuf_mask_lo_highclear, + m128 shuf_mask_lo_highset, + const u8 *buf, const u8 *buf_end) { + DEBUG_PRINTF("len %zu\n", buf_end - buf); + const m256 wide_clear = set2x128(shuf_mask_lo_highclear); + const m256 wide_set = set2x128(shuf_mask_lo_highset); + + assert(buf && buf_end); + assert(buf < buf_end); + const u8 *rv; + + if (buf_end - buf < 32) { + return truffleMini(wide_clear, wide_set, buf, buf_end); + } + + size_t min = (size_t)buf % 32; + assert(buf_end - buf >= 32); + + // Preconditioning: most of the time our buffer won't be aligned. + m256 chars = loadu256(buf); + rv = fwdBlock(wide_clear, wide_set, chars, buf); + if (rv) { + return rv; + } + buf += (32 - min); + + const u8 *last_block = buf_end - 32; + while (buf < last_block) { + m256 lchars = load256(buf); + rv = fwdBlock(wide_clear, wide_set, lchars, buf); + if (rv) { + return rv; + } + buf += 32; + } + + // Use an unaligned load to mop up the last 32 bytes and get an accurate + // picture to buf_end. 
+ assert(buf <= buf_end && buf >= buf_end - 32); + chars = loadu256(buf_end - 32); + rv = fwdBlock(wide_clear, wide_set, chars, buf_end - 32); + if (rv) { + return rv; + } + return buf_end; +} + +static +const u8 *truffleRevMini(m256 shuf_mask_lo_highclear, + m256 shuf_mask_lo_highset, const u8 *buf, + const u8 *buf_end) { + uintptr_t len = buf_end - buf; + assert(len < 32); + + m256 chars = zeroes256(); + memcpy(&chars, buf, len); + + u32 mask = (0xffffffff >> (32 - len)) ^ 0xffffffff; + u32 z = block(shuf_mask_lo_highclear, shuf_mask_lo_highset, chars); + const u8 *rv = lastMatch(buf, z | mask); + + if (rv) { + return rv; + } + return buf - 1; +} + + +const u8 *rtruffleExec(m128 shuf_mask_lo_highclear, + m128 shuf_mask_lo_highset, + const u8 *buf, const u8 *buf_end) { + const m256 wide_clear = set2x128(shuf_mask_lo_highclear); + const m256 wide_set = set2x128(shuf_mask_lo_highset); + assert(buf && buf_end); + assert(buf < buf_end); + const u8 *rv; + + DEBUG_PRINTF("len %zu\n", buf_end - buf); + + if (buf_end - buf < 32) { + return truffleRevMini(wide_clear, wide_set, buf, buf_end); + } + + assert(buf_end - buf >= 32); + + // Preconditioning: most of the time our buffer won't be aligned. + m256 chars = loadu256(buf_end - 32); + rv = revBlock(wide_clear, wide_set, chars, + buf_end - 32); + if (rv) { + return rv; + } + buf_end = (const u8 *)((size_t)buf_end & ~((size_t)0x1f)); + + const u8 *last_block = buf + 32; + while (buf_end > last_block) { + buf_end -= 32; + m256 lchars = load256(buf_end); + rv = revBlock(wide_clear, wide_set, lchars, buf_end); + if (rv) { + return rv; + } + } + + // Use an unaligned load to mop up the last 32 bytes and get an accurate + // picture to buf_end. + chars = loadu256(buf); + rv = revBlock(wide_clear, wide_set, chars, buf); + if (rv) { + return rv; + } + return buf - 1; +} + +#else // AVX512 + +static really_inline +const u8 *lastMatch(const u8 *buf, u64a z) { + if (unlikely(z != ~0ULL)) { + u64a pos = clz64(~z); + assert(pos < 64); + return buf + (63 - pos); + } + + return NULL; // no match +} + +static really_inline +const u8 *firstMatch(const u8 *buf, u64a z) { + if (unlikely(z != ~0ULL)) { + u64a pos = ctz64(~z); + assert(pos < 64); + DEBUG_PRINTF("pos %llu\n", pos); + return buf + pos; + } + + return NULL; // no match +} + +static really_inline +u64a block(m512 shuf_mask_lo_highclear, m512 shuf_mask_lo_highset, m512 v) { + m512 highconst = set64x8(0x80); + m512 shuf_mask_hi = set8x64(0x8040201008040201); + + // and now do the real work + m512 shuf1 = pshufb_m512(shuf_mask_lo_highclear, v); + m512 t1 = xor512(v, highconst); + m512 shuf2 = pshufb_m512(shuf_mask_lo_highset, t1); + m512 t2 = andnot512(highconst, rshift64_m512(v, 4)); + m512 shuf3 = pshufb_m512(shuf_mask_hi, t2); + m512 tmp = and512(or512(shuf1, shuf2), shuf3); + u64a z = eq512mask(tmp, zeroes512()); + + return z; +} + +static really_inline +const u8 *truffleMini(m512 shuf_mask_lo_highclear, m512 shuf_mask_lo_highset, + const u8 *buf, const u8 *buf_end) { + uintptr_t len = buf_end - buf; + assert(len <= 64); + + __mmask64 mask = (~0ULL) >> (64 - len); + + m512 chars = loadu_maskz_m512(mask, buf); + + u64a z = block(shuf_mask_lo_highclear, shuf_mask_lo_highset, chars); + + const u8 *rv = firstMatch(buf, z | ~mask); + + return rv; +} + +static really_inline +const u8 *fwdBlock(m512 shuf_mask_lo_highclear, m512 shuf_mask_lo_highset, + m512 v, const u8 *buf) { + u64a z = block(shuf_mask_lo_highclear, shuf_mask_lo_highset, v); + return firstMatch(buf, z); +} + +static really_inline +const u8 
*revBlock(m512 shuf_mask_lo_highclear, m512 shuf_mask_lo_highset, + m512 v, const u8 *buf) { + u64a z = block(shuf_mask_lo_highclear, shuf_mask_lo_highset, v); + return lastMatch(buf, z); +} + +const u8 *truffleExec(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highset, + const u8 *buf, const u8 *buf_end) { + DEBUG_PRINTF("len %zu\n", buf_end - buf); + const m512 wide_clear = set4x128(shuf_mask_lo_highclear); + const m512 wide_set = set4x128(shuf_mask_lo_highset); + + assert(buf && buf_end); + assert(buf < buf_end); + const u8 *rv; + + if (buf_end - buf <= 64) { + rv = truffleMini(wide_clear, wide_set, buf, buf_end); + return rv ? rv : buf_end; + } + + assert(buf_end - buf >= 64); + if ((uintptr_t)buf % 64) { + // Preconditioning: most of the time our buffer won't be aligned. + rv = truffleMini(wide_clear, wide_set, buf, ROUNDUP_PTR(buf, 64)); + if (rv) { + return rv; + } + buf = ROUNDUP_PTR(buf, 64); + } + const u8 *last_block = buf_end - 64; + while (buf < last_block) { + m512 lchars = load512(buf); + rv = fwdBlock(wide_clear, wide_set, lchars, buf); + if (rv) { + return rv; + } + buf += 64; + } + + // Use an unaligned load to mop up the last 64 bytes and get an accurate + // picture to buf_end. + assert(buf <= buf_end && buf >= buf_end - 64); + m512 chars = loadu512(buf_end - 64); + rv = fwdBlock(wide_clear, wide_set, chars, buf_end - 64); + if (rv) { + return rv; + } + return buf_end; +} + +static really_inline +const u8 *truffleRevMini(m512 shuf_mask_lo_highclear, m512 shuf_mask_lo_highset, + const u8 *buf, const u8 *buf_end) { + uintptr_t len = buf_end - buf; + assert(len < 64); + + __mmask64 mask = (~0ULL) >> (64 - len); + m512 chars = loadu_maskz_m512(mask, buf); + u64a z = block(shuf_mask_lo_highclear, shuf_mask_lo_highset, chars); + DEBUG_PRINTF("mask 0x%016llx z 0x%016llx\n", mask, z); + const u8 *rv = lastMatch(buf, z | ~mask); + + if (rv) { + return rv; + } + return buf - 1; +} + +const u8 *rtruffleExec(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highset, + const u8 *buf, const u8 *buf_end) { + const m512 wide_clear = set4x128(shuf_mask_lo_highclear); + const m512 wide_set = set4x128(shuf_mask_lo_highset); + assert(buf && buf_end); + assert(buf < buf_end); + const u8 *rv; + + DEBUG_PRINTF("len %zu\n", buf_end - buf); + + if (buf_end - buf < 64) { + return truffleRevMini(wide_clear, wide_set, buf, buf_end); + } + + assert(buf_end - buf >= 64); + + // Preconditioning: most of the time our buffer won't be aligned. + m512 chars = loadu512(buf_end - 64); + rv = revBlock(wide_clear, wide_set, chars, buf_end - 64); + if (rv) { + return rv; + } + buf_end = (const u8 *)ROUNDDOWN_N((uintptr_t)buf_end, 64); + + const u8 *last_block = buf + 64; + while (buf_end > last_block) { + buf_end -= 64; + m512 lchars = load512(buf_end); + rv = revBlock(wide_clear, wide_set, lchars, buf_end); + if (rv) { + return rv; + } + } + + // Use an unaligned load to mop up the last 64 bytes and get an accurate + // picture to buf_end. 
+ chars = loadu512(buf); + rv = revBlock(wide_clear, wide_set, chars, buf); + if (rv) { + return rv; + } + return buf - 1; +} + +#endif diff --git a/regex/nfa/truffle.h b/regex/nfa/truffle.h new file mode 100644 index 000000000..f67227ad1 --- /dev/null +++ b/regex/nfa/truffle.h @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2015, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Truffle: fully general character class acceleration. + * + * Utilises the SSSE3 pshufb or AVX2 vpshufb shuffle instructions + */ + +#ifndef TRUFFLE_H +#define TRUFFLE_H + +#include "util/simd_types.h" + +#ifdef __cplusplus +extern "C" +{ +#endif + +const u8 *truffleExec(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highset, + const u8 *buf, const u8 *buf_end); + +const u8 *rtruffleExec(m128 shuf_mask_lo_highclear, m128 shuf_mask_lo_highset, + const u8 *buf, const u8 *buf_end); + +#ifdef __cplusplus +} +#endif + + +#endif /* TRUFFLE_H */ + diff --git a/regex/nfa/vermicelli.h b/regex/nfa/vermicelli.h new file mode 100644 index 000000000..ed797d83f --- /dev/null +++ b/regex/nfa/vermicelli.h @@ -0,0 +1,518 @@ +/* + * Copyright (c) 2015-2020, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Vermicelli: single-byte and double-byte acceleration. + */ + +#ifndef VERMICELLI_H +#define VERMICELLI_H + +#include "util/bitutils.h" +#include "util/simd_utils.h" +#include "util/unaligned.h" + +#include "vermicelli_sse.h" + +static really_inline +const u8 *vermicelliExec(char c, char nocase, const u8 *buf, + const u8 *buf_end) { + DEBUG_PRINTF("verm scan %s\\x%02hhx over %zu bytes\n", + nocase ? "nocase " : "", c, (size_t)(buf_end - buf)); + assert(buf < buf_end); + + VERM_TYPE chars = VERM_SET_FN(c); /* nocase already uppercase */ + + // Handle small scans. +#ifdef HAVE_AVX512 + if (buf_end - buf <= VERM_BOUNDARY) { + const u8 *ptr = nocase + ? vermMiniNocase(chars, buf, buf_end, 0) + : vermMini(chars, buf, buf_end, 0); + if (ptr) { + return ptr; + } + return buf_end; + } +#else + if (buf_end - buf < VERM_BOUNDARY) { + for (; buf < buf_end; buf++) { + char cur = (char)*buf; + if (nocase) { + cur &= CASE_CLEAR; + } + if (cur == c) { + break; + } + } + return buf; + } +#endif + + uintptr_t min = (uintptr_t)buf % VERM_BOUNDARY; + if (min) { + // Input isn't aligned, so we need to run one iteration with an + // unaligned load, then skip buf forward to the next aligned address. + // There's some small overlap here, but we don't mind scanning it twice + // if we can do it quickly, do we? + const u8 *ptr = nocase ? vermUnalignNocase(chars, buf, 0) + : vermUnalign(chars, buf, 0); + if (ptr) { + return ptr; + } + + buf += VERM_BOUNDARY - min; + assert(buf < buf_end); + } + + // Aligned loops from here on in + const u8 *ptr = nocase ? vermSearchAlignedNocase(chars, buf, buf_end - 1, 0) + : vermSearchAligned(chars, buf, buf_end - 1, 0); + if (ptr) { + return ptr; + } + + // Tidy up the mess at the end + ptr = nocase ? vermUnalignNocase(chars, buf_end - VERM_BOUNDARY, 0) + : vermUnalign(chars, buf_end - VERM_BOUNDARY, 0); + return ptr ? ptr : buf_end; +} + +/* like vermicelliExec except returns the address of the first character which + * is not c */ +static really_inline +const u8 *nvermicelliExec(char c, char nocase, const u8 *buf, + const u8 *buf_end) { + DEBUG_PRINTF("nverm scan %s\\x%02hhx over %zu bytes\n", + nocase ? "nocase " : "", c, (size_t)(buf_end - buf)); + assert(buf < buf_end); + + VERM_TYPE chars = VERM_SET_FN(c); /* nocase already uppercase */ + + // Handle small scans. +#ifdef HAVE_AVX512 + if (buf_end - buf <= VERM_BOUNDARY) { + const u8 *ptr = nocase + ? 
vermMiniNocase(chars, buf, buf_end, 1) + : vermMini(chars, buf, buf_end, 1); + if (ptr) { + return ptr; + } + return buf_end; + } +#else + if (buf_end - buf < VERM_BOUNDARY) { + for (; buf < buf_end; buf++) { + char cur = (char)*buf; + if (nocase) { + cur &= CASE_CLEAR; + } + if (cur != c) { + break; + } + } + return buf; + } +#endif + + size_t min = (size_t)buf % VERM_BOUNDARY; + if (min) { + // Input isn't aligned, so we need to run one iteration with an + // unaligned load, then skip buf forward to the next aligned address. + // There's some small overlap here, but we don't mind scanning it twice + // if we can do it quickly, do we? + const u8 *ptr = nocase ? vermUnalignNocase(chars, buf, 1) + : vermUnalign(chars, buf, 1); + if (ptr) { + return ptr; + } + + buf += VERM_BOUNDARY - min; + assert(buf < buf_end); + } + + // Aligned loops from here on in + const u8 *ptr = nocase ? vermSearchAlignedNocase(chars, buf, buf_end - 1, 1) + : vermSearchAligned(chars, buf, buf_end - 1, 1); + if (ptr) { + return ptr; + } + + // Tidy up the mess at the end + ptr = nocase ? vermUnalignNocase(chars, buf_end - VERM_BOUNDARY, 1) + : vermUnalign(chars, buf_end - VERM_BOUNDARY, 1); + return ptr ? ptr : buf_end; +} + +static really_inline +const u8 *vermicelliDoubleExec(char c1, char c2, char nocase, const u8 *buf, + const u8 *buf_end) { + DEBUG_PRINTF("double verm scan %s\\x%02hhx%02hhx over %zu bytes\n", + nocase ? "nocase " : "", c1, c2, (size_t)(buf_end - buf)); + assert(buf < buf_end); + + VERM_TYPE chars1 = VERM_SET_FN(c1); /* nocase already uppercase */ + VERM_TYPE chars2 = VERM_SET_FN(c2); /* nocase already uppercase */ + +#ifdef HAVE_AVX512 + if (buf_end - buf <= VERM_BOUNDARY) { + const u8 *ptr = nocase + ? dvermMiniNocase(chars1, chars2, buf, buf_end) + : dvermMini(chars1, chars2, buf, buf_end); + if (ptr) { + return ptr; + } + + /* check for partial match at end */ + u8 mask = nocase ? CASE_CLEAR : 0xff; + if ((buf_end[-1] & mask) == (u8)c1) { + DEBUG_PRINTF("partial!!!\n"); + return buf_end - 1; + } + + return buf_end; + } +#endif + + assert((buf_end - buf) >= VERM_BOUNDARY); + uintptr_t min = (uintptr_t)buf % VERM_BOUNDARY; + if (min) { + // Input isn't aligned, so we need to run one iteration with an + // unaligned load, then skip buf forward to the next aligned address. + // There's some small overlap here, but we don't mind scanning it twice + // if we can do it quickly, do we? + const u8 *ptr = nocase + ? dvermPreconditionNocase(chars1, chars2, buf) + : dvermPrecondition(chars1, chars2, buf); + if (ptr) { + return ptr; + } + + buf += VERM_BOUNDARY - min; + assert(buf < buf_end); + } + + // Aligned loops from here on in + const u8 *ptr = nocase ? dvermSearchAlignedNocase(chars1, chars2, c1, c2, + buf, buf_end) + : dvermSearchAligned(chars1, chars2, c1, c2, buf, + buf_end); + if (ptr) { + return ptr; + } + + // Tidy up the mess at the end + ptr = nocase ? dvermPreconditionNocase(chars1, chars2, + buf_end - VERM_BOUNDARY) + : dvermPrecondition(chars1, chars2, buf_end - VERM_BOUNDARY); + + if (ptr) { + return ptr; + } + + /* check for partial match at end */ + u8 mask = nocase ? 
CASE_CLEAR : 0xff; + if ((buf_end[-1] & mask) == (u8)c1) { + DEBUG_PRINTF("partial!!!\n"); + return buf_end - 1; + } + + return buf_end; +} + +static really_inline +const u8 *vermicelliDoubleMaskedExec(char c1, char c2, char m1, char m2, + const u8 *buf, const u8 *buf_end) { + DEBUG_PRINTF("double verm scan (\\x%02hhx&\\x%02hhx)(\\x%02hhx&\\x%02hhx) " + "over %zu bytes\n", c1, m1, c2, m2, (size_t)(buf_end - buf)); + assert(buf < buf_end); + + VERM_TYPE chars1 = VERM_SET_FN(c1); + VERM_TYPE chars2 = VERM_SET_FN(c2); + VERM_TYPE mask1 = VERM_SET_FN(m1); + VERM_TYPE mask2 = VERM_SET_FN(m2); + +#ifdef HAVE_AVX512 + if (buf_end - buf <= VERM_BOUNDARY) { + const u8 *ptr = dvermMiniMasked(chars1, chars2, mask1, mask2, buf, + buf_end); + if (ptr) { + return ptr; + } + + /* check for partial match at end */ + if ((buf_end[-1] & m1) == (u8)c1) { + DEBUG_PRINTF("partial!!!\n"); + return buf_end - 1; + } + + return buf_end; + } +#endif + + assert((buf_end - buf) >= VERM_BOUNDARY); + uintptr_t min = (uintptr_t)buf % VERM_BOUNDARY; + if (min) { + // Input isn't aligned, so we need to run one iteration with an + // unaligned load, then skip buf forward to the next aligned address. + // There's some small overlap here, but we don't mind scanning it twice + // if we can do it quickly, do we? + const u8 *p = dvermPreconditionMasked(chars1, chars2, mask1, mask2, buf); + if (p) { + return p; + } + + buf += VERM_BOUNDARY - min; + assert(buf < buf_end); + } + + // Aligned loops from here on in + const u8 *ptr = dvermSearchAlignedMasked(chars1, chars2, mask1, mask2, c1, + c2, m1, m2, buf, buf_end); + if (ptr) { + return ptr; + } + + // Tidy up the mess at the end + ptr = dvermPreconditionMasked(chars1, chars2, mask1, mask2, + buf_end - VERM_BOUNDARY); + + if (ptr) { + return ptr; + } + + /* check for partial match at end */ + if ((buf_end[-1] & m1) == (u8)c1) { + DEBUG_PRINTF("partial!!!\n"); + return buf_end - 1; + } + + return buf_end; +} + +// Reverse vermicelli scan. Provides exact semantics and returns (buf - 1) if +// character not found. +static really_inline +const u8 *rvermicelliExec(char c, char nocase, const u8 *buf, + const u8 *buf_end) { + DEBUG_PRINTF("rev verm scan %s\\x%02hhx over %zu bytes\n", + nocase ? "nocase " : "", c, (size_t)(buf_end - buf)); + assert(buf < buf_end); + + VERM_TYPE chars = VERM_SET_FN(c); /* nocase already uppercase */ + + // Handle small scans. +#ifdef HAVE_AVX512 + if (buf_end - buf <= VERM_BOUNDARY) { + const u8 *ptr = nocase + ? rvermMiniNocase(chars, buf, buf_end, 0) + : rvermMini(chars, buf, buf_end, 0); + if (ptr) { + return ptr; + } + return buf - 1; + } +#else + if (buf_end - buf < VERM_BOUNDARY) { + for (buf_end--; buf_end >= buf; buf_end--) { + char cur = (char)*buf_end; + if (nocase) { + cur &= CASE_CLEAR; + } + if (cur == c) { + break; + } + } + return buf_end; + } +#endif + + size_t min = (size_t)buf_end % VERM_BOUNDARY; + if (min) { + // Input isn't aligned, so we need to run one iteration with an + // unaligned load, then skip buf backward to the next aligned address. + // There's some small overlap here, but we don't mind scanning it twice + // if we can do it quickly, do we? + const u8 *ptr = nocase ? rvermUnalignNocase(chars, + buf_end - VERM_BOUNDARY, + 0) + : rvermUnalign(chars, buf_end - VERM_BOUNDARY, + 0); + + if (ptr) { + return ptr; + } + + buf_end -= min; + if (buf >= buf_end) { + return buf_end; + } + } + + // Aligned loops from here on in. + const u8 *ptr = nocase ? 
rvermSearchAlignedNocase(chars, buf, buf_end, 0) + : rvermSearchAligned(chars, buf, buf_end, 0); + if (ptr) { + return ptr; + } + + // Tidy up the mess at the end, return buf - 1 if not found. + ptr = nocase ? rvermUnalignNocase(chars, buf, 0) + : rvermUnalign(chars, buf, 0); + return ptr ? ptr : buf - 1; +} + +/* like rvermicelliExec except returns the address of the last character which + * is not c */ +static really_inline +const u8 *rnvermicelliExec(char c, char nocase, const u8 *buf, + const u8 *buf_end) { + DEBUG_PRINTF("rev verm scan %s\\x%02hhx over %zu bytes\n", + nocase ? "nocase " : "", c, (size_t)(buf_end - buf)); + assert(buf < buf_end); + + VERM_TYPE chars = VERM_SET_FN(c); /* nocase already uppercase */ + + // Handle small scans. +#ifdef HAVE_AVX512 + if (buf_end - buf <= VERM_BOUNDARY) { + const u8 *ptr = nocase + ? rvermMiniNocase(chars, buf, buf_end, 1) + : rvermMini(chars, buf, buf_end, 1); + if (ptr) { + return ptr; + } + return buf - 1; + } +#else + if (buf_end - buf < VERM_BOUNDARY) { + for (buf_end--; buf_end >= buf; buf_end--) { + char cur = (char)*buf_end; + if (nocase) { + cur &= CASE_CLEAR; + } + if (cur != c) { + break; + } + } + return buf_end; + } +#endif + + size_t min = (size_t)buf_end % VERM_BOUNDARY; + if (min) { + // Input isn't aligned, so we need to run one iteration with an + // unaligned load, then skip buf backward to the next aligned address. + // There's some small overlap here, but we don't mind scanning it twice + // if we can do it quickly, do we? + const u8 *ptr = nocase ? rvermUnalignNocase(chars, + buf_end - VERM_BOUNDARY, + 1) + : rvermUnalign(chars, buf_end - VERM_BOUNDARY, + 1); + + if (ptr) { + return ptr; + } + + buf_end -= min; + if (buf >= buf_end) { + return buf_end; + } + } + + // Aligned loops from here on in. + const u8 *ptr = nocase ? rvermSearchAlignedNocase(chars, buf, buf_end, 1) + : rvermSearchAligned(chars, buf, buf_end, 1); + if (ptr) { + return ptr; + } + + // Tidy up the mess at the end, return buf - 1 if not found. + ptr = nocase ? rvermUnalignNocase(chars, buf, 1) + : rvermUnalign(chars, buf, 1); + return ptr ? ptr : buf - 1; +} + +/* returns highest offset of c2 (NOTE: not c1) */ +static really_inline +const u8 *rvermicelliDoubleExec(char c1, char c2, char nocase, const u8 *buf, + const u8 *buf_end) { + DEBUG_PRINTF("rev double verm scan %s\\x%02hhx%02hhx over %zu bytes\n", + nocase ? "nocase " : "", c1, c2, (size_t)(buf_end - buf)); + assert(buf < buf_end); + + VERM_TYPE chars1 = VERM_SET_FN(c1); /* nocase already uppercase */ + VERM_TYPE chars2 = VERM_SET_FN(c2); /* nocase already uppercase */ + +#ifdef HAVE_AVX512 + if (buf_end - buf <= VERM_BOUNDARY) { + const u8 *ptr = nocase + ? rdvermMiniNocase(chars1, chars2, buf, buf_end) + : rdvermMini(chars1, chars2, buf, buf_end); + + if (ptr) { + return ptr; + } + + // check for partial match at end ??? + return buf - 1; + } +#endif + + assert((buf_end - buf) >= VERM_BOUNDARY); + size_t min = (size_t)buf_end % VERM_BOUNDARY; + if (min) { + // input not aligned, so we need to run one iteration with an unaligned + // load, then skip buf forward to the next aligned address. There's + // some small overlap here, but we don't mind scanning it twice if we + // can do it quickly, do we? + const u8 *ptr = nocase ? 
rdvermPreconditionNocase(chars1, chars2, + buf_end - VERM_BOUNDARY) + : rdvermPrecondition(chars1, chars2, + buf_end - VERM_BOUNDARY); + + if (ptr) { + return ptr; + } + + buf_end -= min; + if (buf >= buf_end) { + return buf_end; + } + } + + // Aligned loops from here on in + if (nocase) { + return rdvermSearchAlignedNocase(chars1, chars2, c1, c2, buf, buf_end); + } else { + return rdvermSearchAligned(chars1, chars2, c1, c2, buf, buf_end); + } +} + +#endif /* VERMICELLI_H */ diff --git a/regex/nfa/vermicelli_run.h b/regex/nfa/vermicelli_run.h new file mode 100644 index 000000000..d6fe7ec78 --- /dev/null +++ b/regex/nfa/vermicelli_run.h @@ -0,0 +1,90 @@ +/* + * Copyright (c) 2015, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "vermicelli.h" + +static really_inline +const u8 *find_xverm_run(char c, char nocase, u32 repeat, UNUSED const u8 *buf, + const u8 *buf_start, const u8 *buf_end, char negate) { + DEBUG_PRINTF("looking for 0x%hhx{%u} in %p [%zd, %zd)\n", c, repeat, buf, + buf_start - buf, buf_end - buf); + + /* TODO optimise on where it is easy to get a dense bitfield of character + * matches */ + if (repeat == 1) { + return negate ? nvermicelliExec(c, nocase, buf_start, buf_end) + : vermicelliExec(c, nocase, buf_start, buf_end); + } + + while (1) { + const u8 *s; + if (negate) { + s = nvermicelliExec(c, nocase, buf_start, buf_end); + } else if (buf_end - buf_start >= VERM_BOUNDARY && !nocase) { + s = vermicelliDoubleExec(c, c, nocase, buf_start, buf_end); + + if (s != buf_end && *s != c) { /* double verm is not certain to be + * precise */ + s = vermicelliExec(c, nocase, s, buf_end); + } + } else { + s = vermicelliExec(c, nocase, buf_start, buf_end); + } + if (s == buf_end) { + return s; + } + + DEBUG_PRINTF("cand %zd\n", s - buf); + + const u8 *test_e = MIN(s + repeat, buf_end); + + const u8 *rv = negate ? 
vermicelliExec(c, nocase, s, test_e) + : nvermicelliExec(c, nocase, s, test_e); + + assert(rv > buf_start); + assert(rv <= buf_end); + + if (rv == test_e) { + return s; + } + + buf_start = rv; + } +} + +static really_inline +const u8 *find_verm_run(char c, char nocase, u32 repeat, const u8 *buf, + const u8 *buf_start, const u8 *buf_end) { + return find_xverm_run(c, nocase, repeat, buf, buf_start, buf_end, 0); +} + +static really_inline +const u8 *find_nverm_run(char c, char nocase, u32 repeat, const u8 *buf, + const u8 *buf_start, const u8 *buf_end) { + return find_xverm_run(c, nocase, repeat, buf, buf_start, buf_end, 1); +} diff --git a/regex/nfa/vermicelli_sse.h b/regex/nfa/vermicelli_sse.h new file mode 100644 index 000000000..3307486cf --- /dev/null +++ b/regex/nfa/vermicelli_sse.h @@ -0,0 +1,889 @@ +/* + * Copyright (c) 2015-2020, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Vermicelli: Intel SSE implementation. 
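+ *
+ * As a rough sketch (assuming the negate flag is 0), the single-character
+ * scan broadcasts the target byte into a vector, compares 16 input bytes per
+ * iteration, turns the comparison into a bitmask with movemask128() and
+ * locates the first hit with ctz32().  A scalar equivalent of
+ * vermSearchAligned(chars, buf, buf_end, 0) would behave roughly like:
+ *
+ *     for (; buf < buf_end; buf++) {
+ *         if (*buf == c)
+ *             return buf;   // first occurrence of c
+ *     }
+ *     return NULL;          // caller handles the unaligned tail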
+ * + * (users should include vermicelli.h) + */ + +#if !defined(HAVE_AVX512) + +#define VERM_BOUNDARY 16 +#define VERM_TYPE m128 +#define VERM_SET_FN set16x8 + +static really_inline +const u8 *vermSearchAligned(m128 chars, const u8 *buf, const u8 *buf_end, + char negate) { + assert((size_t)buf % 16 == 0); + for (; buf + 31 < buf_end; buf += 32) { + m128 data = load128(buf); + u32 z1 = movemask128(eq128(chars, data)); + m128 data2 = load128(buf + 16); + u32 z2 = movemask128(eq128(chars, data2)); + u32 z = z1 | (z2 << 16); + if (negate) { + z = ~z; + } + if (unlikely(z)) { + u32 pos = ctz32(z); + return buf + pos; + } + } + for (; buf + 15 < buf_end; buf += 16) { + m128 data = load128(buf); + u32 z = movemask128(eq128(chars, data)); + if (negate) { + z = ~z & 0xffff; + } + if (unlikely(z)) { + u32 pos = ctz32(z); + return buf + pos; + } + } + return NULL; +} + +static really_inline +const u8 *vermSearchAlignedNocase(m128 chars, const u8 *buf, + const u8 *buf_end, char negate) { + assert((size_t)buf % 16 == 0); + m128 casemask = set16x8(CASE_CLEAR); + + for (; buf + 31 < buf_end; buf += 32) { + m128 data = load128(buf); + u32 z1 = movemask128(eq128(chars, and128(casemask, data))); + m128 data2 = load128(buf + 16); + u32 z2 = movemask128(eq128(chars, and128(casemask, data2))); + u32 z = z1 | (z2 << 16); + if (negate) { + z = ~z; + } + if (unlikely(z)) { + u32 pos = ctz32(z); + return buf + pos; + } + } + + for (; buf + 15 < buf_end; buf += 16) { + m128 data = load128(buf); + u32 z = movemask128(eq128(chars, and128(casemask, data))); + if (negate) { + z = ~z & 0xffff; + } + if (unlikely(z)) { + u32 pos = ctz32(z); + return buf + pos; + } + } + return NULL; +} + +// returns NULL if not found +static really_inline +const u8 *vermUnalign(m128 chars, const u8 *buf, char negate) { + m128 data = loadu128(buf); // unaligned + u32 z = movemask128(eq128(chars, data)); + if (negate) { + z = ~z & 0xffff; + } + if (unlikely(z)) { + return buf + ctz32(z); + } + return NULL; +} + +// returns NULL if not found +static really_inline +const u8 *vermUnalignNocase(m128 chars, const u8 *buf, char negate) { + m128 casemask = set16x8(CASE_CLEAR); + m128 data = loadu128(buf); // unaligned + u32 z = movemask128(eq128(chars, and128(casemask, data))); + if (negate) { + z = ~z & 0xffff; + } + if (unlikely(z)) { + return buf + ctz32(z); + } + return NULL; +} + +static really_inline +const u8 *dvermSearchAligned(m128 chars1, m128 chars2, u8 c1, u8 c2, + const u8 *buf, const u8 *buf_end) { + for (; buf + 16 < buf_end; buf += 16) { + m128 data = load128(buf); + u32 z = movemask128(and128(eq128(chars1, data), + rshiftbyte_m128(eq128(chars2, data), 1))); + if (buf[15] == c1 && buf[16] == c2) { + z |= (1 << 15); + } + if (unlikely(z)) { + u32 pos = ctz32(z); + return buf + pos; + } + } + + return NULL; +} + +static really_inline +const u8 *dvermSearchAlignedNocase(m128 chars1, m128 chars2, u8 c1, u8 c2, + const u8 *buf, const u8 *buf_end) { + assert((size_t)buf % 16 == 0); + m128 casemask = set16x8(CASE_CLEAR); + + for (; buf + 16 < buf_end; buf += 16) { + m128 data = load128(buf); + m128 v = and128(casemask, data); + u32 z = movemask128(and128(eq128(chars1, v), + rshiftbyte_m128(eq128(chars2, v), 1))); + if ((buf[15] & CASE_CLEAR) == c1 && (buf[16] & CASE_CLEAR) == c2) { + z |= (1 << 15); + } + if (unlikely(z)) { + u32 pos = ctz32(z); + return buf + pos; + } + } + + return NULL; +} + +static really_inline +const u8 *dvermSearchAlignedMasked(m128 chars1, m128 chars2, + m128 mask1, m128 mask2, u8 c1, u8 c2, u8 m1, + u8 m2, 
const u8 *buf, const u8 *buf_end) { + assert((size_t)buf % 16 == 0); + + for (; buf + 16 < buf_end; buf += 16) { + m128 data = load128(buf); + m128 v1 = eq128(chars1, and128(data, mask1)); + m128 v2 = eq128(chars2, and128(data, mask2)); + u32 z = movemask128(and128(v1, rshiftbyte_m128(v2, 1))); + + if ((buf[15] & m1) == c1 && (buf[16] & m2) == c2) { + z |= (1 << 15); + } + if (unlikely(z)) { + u32 pos = ctz32(z); + return buf + pos; + } + } + + return NULL; +} + +// returns NULL if not found +static really_inline +const u8 *dvermPrecondition(m128 chars1, m128 chars2, const u8 *buf) { + m128 data = loadu128(buf); // unaligned + u32 z = movemask128(and128(eq128(chars1, data), + rshiftbyte_m128(eq128(chars2, data), 1))); + + /* no fixup of the boundary required - the aligned run will pick it up */ + if (unlikely(z)) { + u32 pos = ctz32(z); + return buf + pos; + } + return NULL; +} + +// returns NULL if not found +static really_inline +const u8 *dvermPreconditionNocase(m128 chars1, m128 chars2, const u8 *buf) { + /* due to laziness, nonalphas and nocase having interesting behaviour */ + m128 casemask = set16x8(CASE_CLEAR); + m128 data = loadu128(buf); // unaligned + m128 v = and128(casemask, data); + u32 z = movemask128(and128(eq128(chars1, v), + rshiftbyte_m128(eq128(chars2, v), 1))); + + /* no fixup of the boundary required - the aligned run will pick it up */ + if (unlikely(z)) { + u32 pos = ctz32(z); + return buf + pos; + } + return NULL; +} + +// returns NULL if not found +static really_inline +const u8 *dvermPreconditionMasked(m128 chars1, m128 chars2, + m128 mask1, m128 mask2, const u8 *buf) { + m128 data = loadu128(buf); // unaligned + m128 v1 = eq128(chars1, and128(data, mask1)); + m128 v2 = eq128(chars2, and128(data, mask2)); + u32 z = movemask128(and128(v1, rshiftbyte_m128(v2, 1))); + + /* no fixup of the boundary required - the aligned run will pick it up */ + if (unlikely(z)) { + u32 pos = ctz32(z); + return buf + pos; + } + return NULL; +} + +static really_inline +const u8 *lastMatchOffset(const u8 *buf_end, u32 z) { + assert(z); + return buf_end - 16 + 31 - clz32(z); +} + +static really_inline +const u8 *rvermSearchAligned(m128 chars, const u8 *buf, const u8 *buf_end, + char negate) { + assert((size_t)buf_end % 16 == 0); + for (; buf + 15 < buf_end; buf_end -= 16) { + m128 data = load128(buf_end - 16); + u32 z = movemask128(eq128(chars, data)); + if (negate) { + z = ~z & 0xffff; + } + if (unlikely(z)) { + return lastMatchOffset(buf_end, z); + } + } + return NULL; +} + +static really_inline +const u8 *rvermSearchAlignedNocase(m128 chars, const u8 *buf, + const u8 *buf_end, char negate) { + assert((size_t)buf_end % 16 == 0); + m128 casemask = set16x8(CASE_CLEAR); + + for (; buf + 15 < buf_end; buf_end -= 16) { + m128 data = load128(buf_end - 16); + u32 z = movemask128(eq128(chars, and128(casemask, data))); + if (negate) { + z = ~z & 0xffff; + } + if (unlikely(z)) { + return lastMatchOffset(buf_end, z); + } + } + return NULL; +} + +// returns NULL if not found +static really_inline +const u8 *rvermUnalign(m128 chars, const u8 *buf, char negate) { + m128 data = loadu128(buf); // unaligned + u32 z = movemask128(eq128(chars, data)); + if (negate) { + z = ~z & 0xffff; + } + if (unlikely(z)) { + return lastMatchOffset(buf + 16, z); + } + return NULL; +} + +// returns NULL if not found +static really_inline +const u8 *rvermUnalignNocase(m128 chars, const u8 *buf, char negate) { + m128 casemask = set16x8(CASE_CLEAR); + m128 data = loadu128(buf); // unaligned + u32 z = 
movemask128(eq128(chars, and128(casemask, data))); + if (negate) { + z = ~z & 0xffff; + } + if (unlikely(z)) { + return lastMatchOffset(buf + 16, z); + } + return NULL; +} + +static really_inline +const u8 *rdvermSearchAligned(m128 chars1, m128 chars2, u8 c1, u8 c2, + const u8 *buf, const u8 *buf_end) { + assert((size_t)buf_end % 16 == 0); + + for (; buf + 16 < buf_end; buf_end -= 16) { + m128 data = load128(buf_end - 16); + u32 z = movemask128(and128(eq128(chars2, data), + lshiftbyte_m128(eq128(chars1, data), 1))); + if (buf_end[-17] == c1 && buf_end[-16] == c2) { + z |= 1; + } + if (unlikely(z)) { + return lastMatchOffset(buf_end, z); + } + } + return buf_end; +} + +static really_inline +const u8 *rdvermSearchAlignedNocase(m128 chars1, m128 chars2, u8 c1, u8 c2, + const u8 *buf, const u8 *buf_end) { + assert((size_t)buf_end % 16 == 0); + m128 casemask = set16x8(CASE_CLEAR); + + for (; buf + 16 < buf_end; buf_end -= 16) { + m128 data = load128(buf_end - 16); + m128 v = and128(casemask, data); + u32 z = movemask128(and128(eq128(chars2, v), + lshiftbyte_m128(eq128(chars1, v), 1))); + if ((buf_end[-17] & CASE_CLEAR) == c1 + && (buf_end[-16] & CASE_CLEAR) == c2) { + z |= 1; + } + if (unlikely(z)) { + return lastMatchOffset(buf_end, z); + } + } + return buf_end; +} + +// returns NULL if not found +static really_inline +const u8 *rdvermPrecondition(m128 chars1, m128 chars2, const u8 *buf) { + m128 data = loadu128(buf); + u32 z = movemask128(and128(eq128(chars2, data), + lshiftbyte_m128(eq128(chars1, data), 1))); + + /* no fixup of the boundary required - the aligned run will pick it up */ + if (unlikely(z)) { + return lastMatchOffset(buf + 16, z); + } + + return NULL; +} + +// returns NULL if not found +static really_inline +const u8 *rdvermPreconditionNocase(m128 chars1, m128 chars2, const u8 *buf) { + /* due to laziness, nonalphas and nocase having interesting behaviour */ + m128 casemask = set16x8(CASE_CLEAR); + m128 data = loadu128(buf); + m128 v = and128(casemask, data); + u32 z = movemask128(and128(eq128(chars2, v), + lshiftbyte_m128(eq128(chars1, v), 1))); + /* no fixup of the boundary required - the aligned run will pick it up */ + if (unlikely(z)) { + return lastMatchOffset(buf + 16, z); + } + + return NULL; +} + +#else // HAVE_AVX512 + +#define VERM_BOUNDARY 64 +#define VERM_TYPE m512 +#define VERM_SET_FN set64x8 + +static really_inline +const u8 *vermMini(m512 chars, const u8 *buf, const u8 *buf_end, char negate) { + uintptr_t len = buf_end - buf; + __mmask64 mask = (~0ULL) >> (64 - len); + m512 data = loadu_maskz_m512(mask, buf); + + u64a z = eq512mask(chars, data); + + if (negate) { + z = ~z & mask; + } + z &= mask; + if (unlikely(z)) { + return buf + ctz64(z); + } + return NULL; +} + +static really_inline +const u8 *vermMiniNocase(m512 chars, const u8 *buf, const u8 *buf_end, + char negate) { + uintptr_t len = buf_end - buf; + __mmask64 mask = (~0ULL) >> (64 - len); + m512 data = loadu_maskz_m512(mask, buf); + m512 casemask = set64x8(CASE_CLEAR); + m512 v = and512(casemask, data); + + u64a z = eq512mask(chars, v); + + if (negate) { + z = ~z & mask; + } + z &= mask; + if (unlikely(z)) { + return buf + ctz64(z); + } + return NULL; +} + +static really_inline +const u8 *vermSearchAligned(m512 chars, const u8 *buf, const u8 *buf_end, + char negate) { + assert((size_t)buf % 64 == 0); + for (; buf + 63 < buf_end; buf += 64) { + m512 data = load512(buf); + u64a z = eq512mask(chars, data); + if (negate) { + z = ~z & ~0ULL; + } + if (unlikely(z)) { + u64a pos = ctz64(z); + return buf + pos; 
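+            /* Each set bit of z marks a matching byte within this 64-byte
+             * block, so for example z == 0x24 (bits 2 and 5 set) gives
+             * ctz64(z) == 2 and the scan reports buf + 2, the first match. */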
+ } + } + return NULL; +} + +static really_inline +const u8 *vermSearchAlignedNocase(m512 chars, const u8 *buf, + const u8 *buf_end, char negate) { + assert((size_t)buf % 64 == 0); + m512 casemask = set64x8(CASE_CLEAR); + + for (; buf + 63 < buf_end; buf += 64) { + m512 data = load512(buf); + u64a z = eq512mask(chars, and512(casemask, data)); + if (negate) { + z = ~z & ~0ULL; + } + if (unlikely(z)) { + u64a pos = ctz64(z); + return buf + pos; + } + } + return NULL; +} + +// returns NULL if not found +static really_inline +const u8 *vermUnalign(m512 chars, const u8 *buf, char negate) { + m512 data = loadu512(buf); // unaligned + u64a z = eq512mask(chars, data); + if (negate) { + z = ~z & ~0ULL; + } + if (unlikely(z)) { + return buf + ctz64(z); + } + return NULL; +} + +// returns NULL if not found +static really_inline +const u8 *vermUnalignNocase(m512 chars, const u8 *buf, char negate) { + m512 casemask = set64x8(CASE_CLEAR); + m512 data = loadu512(buf); // unaligned + u64a z = eq512mask(chars, and512(casemask, data)); + if (negate) { + z = ~z & ~0ULL; + } + if (unlikely(z)) { + return buf + ctz64(z); + } + return NULL; +} + +static really_inline +const u8 *dvermMini(m512 chars1, m512 chars2, const u8 *buf, + const u8 *buf_end) { + uintptr_t len = buf_end - buf; + __mmask64 mask = (~0ULL) >> (64 - len); + m512 data = loadu_maskz_m512(mask, buf); + + u64a z = eq512mask(chars1, data) & (eq512mask(chars2, data) >> 1); + + z &= mask; + if (unlikely(z)) { + u64a pos = ctz64(z); + return buf + pos; + } + return NULL; +} + +static really_inline +const u8 *dvermMiniNocase(m512 chars1, m512 chars2, const u8 *buf, + const u8 *buf_end) { + uintptr_t len = buf_end - buf; + __mmask64 mask = (~0ULL) >> (64 - len); + m512 data = loadu_maskz_m512(mask, buf); + m512 casemask = set64x8(CASE_CLEAR); + m512 v = and512(casemask, data); + + u64a z = eq512mask(chars1, v) & (eq512mask(chars2, v) >> 1); + + z &= mask; + if (unlikely(z)) { + u64a pos = ctz64(z); + return buf + pos; + } + return NULL; +} + +static really_inline +const u8 *dvermMiniMasked(m512 chars1, m512 chars2, m512 mask1, m512 mask2, + const u8 *buf, const u8 *buf_end) { + uintptr_t len = buf_end - buf; + __mmask64 mask = (~0ULL) >> (64 - len); + m512 data = loadu_maskz_m512(mask, buf); + m512 v1 = and512(data, mask1); + m512 v2 = and512(data, mask2); + + u64a z = eq512mask(chars1, v1) & (eq512mask(chars2, v2) >> 1); + + z &= mask; + if (unlikely(z)) { + u64a pos = ctz64(z); + return buf + pos; + } + return NULL; +} + +static really_inline +const u8 *dvermSearchAligned(m512 chars1, m512 chars2, u8 c1, u8 c2, + const u8 *buf, const u8 *buf_end) { + for (; buf + 64 < buf_end; buf += 64) { + m512 data = load512(buf); + u64a z = eq512mask(chars1, data) & (eq512mask(chars2, data) >> 1); + if (buf[63] == c1 && buf[64] == c2) { + z |= (1ULL << 63); + } + if (unlikely(z)) { + u64a pos = ctz64(z); + return buf + pos; + } + } + + return NULL; +} + +static really_inline +const u8 *dvermSearchAlignedNocase(m512 chars1, m512 chars2, u8 c1, u8 c2, + const u8 *buf, const u8 *buf_end) { + assert((size_t)buf % 64 == 0); + m512 casemask = set64x8(CASE_CLEAR); + + for (; buf + 64 < buf_end; buf += 64) { + m512 data = load512(buf); + m512 v = and512(casemask, data); + u64a z = eq512mask(chars1, v) & (eq512mask(chars2, v) >> 1); + if ((buf[63] & CASE_CLEAR) == c1 && (buf[64] & CASE_CLEAR) == c2) { + z |= (1ULL << 63); + } + if (unlikely(z)) { + u64a pos = ctz64(z); + return buf + pos; + } + } + + return NULL; +} + +static really_inline +const u8 
*dvermSearchAlignedMasked(m512 chars1, m512 chars2, + m512 mask1, m512 mask2, u8 c1, u8 c2, u8 m1, + u8 m2, const u8 *buf, const u8 *buf_end) { + assert((size_t)buf % 64 == 0); + + for (; buf + 64 < buf_end; buf += 64) { + m512 data = load512(buf); + m512 v1 = and512(data, mask1); + m512 v2 = and512(data, mask2); + u64a z = eq512mask(chars1, v1) & (eq512mask(chars2, v2) >> 1); + + if ((buf[63] & m1) == c1 && (buf[64] & m2) == c2) { + z |= (1ULL << 63); + } + if (unlikely(z)) { + u64a pos = ctz64(z); + return buf + pos; + } + } + + return NULL; +} + +// returns NULL if not found +static really_inline +const u8 *dvermPrecondition(m512 chars1, m512 chars2, const u8 *buf) { + m512 data = loadu512(buf); // unaligned + u64a z = eq512mask(chars1, data) & (eq512mask(chars2, data) >> 1); + + /* no fixup of the boundary required - the aligned run will pick it up */ + if (unlikely(z)) { + u64a pos = ctz64(z); + return buf + pos; + } + return NULL; +} + +// returns NULL if not found +static really_inline +const u8 *dvermPreconditionNocase(m512 chars1, m512 chars2, const u8 *buf) { + /* due to laziness, nonalphas and nocase having interesting behaviour */ + m512 casemask = set64x8(CASE_CLEAR); + m512 data = loadu512(buf); // unaligned + m512 v = and512(casemask, data); + u64a z = eq512mask(chars1, v) & (eq512mask(chars2, v) >> 1); + + /* no fixup of the boundary required - the aligned run will pick it up */ + if (unlikely(z)) { + u64a pos = ctz64(z); + return buf + pos; + } + return NULL; +} + +// returns NULL if not found +static really_inline +const u8 *dvermPreconditionMasked(m512 chars1, m512 chars2, + m512 mask1, m512 mask2, const u8 *buf) { + m512 data = loadu512(buf); // unaligned + m512 v1 = and512(data, mask1); + m512 v2 = and512(data, mask2); + u64a z = eq512mask(chars1, v1) & (eq512mask(chars2, v2) >> 1); + + /* no fixup of the boundary required - the aligned run will pick it up */ + if (unlikely(z)) { + u64a pos = ctz64(z); + return buf + pos; + } + return NULL; +} + +static really_inline +const u8 *lastMatchOffset(const u8 *buf_end, u64a z) { + assert(z); + return buf_end - 64 + 63 - clz64(z); +} + +static really_inline +const u8 *rvermMini(m512 chars, const u8 *buf, const u8 *buf_end, char negate) { + uintptr_t len = buf_end - buf; + __mmask64 mask = (~0ULL) >> (64 - len); + m512 data = loadu_maskz_m512(mask, buf); + + u64a z = eq512mask(chars, data); + + if (negate) { + z = ~z & mask; + } + z &= mask; + if (unlikely(z)) { + return lastMatchOffset(buf + 64, z); + } + return NULL; +} + +static really_inline +const u8 *rvermMiniNocase(m512 chars, const u8 *buf, const u8 *buf_end, + char negate) { + uintptr_t len = buf_end - buf; + __mmask64 mask = (~0ULL) >> (64 - len); + m512 data = loadu_maskz_m512(mask, buf); + m512 casemask = set64x8(CASE_CLEAR); + m512 v = and512(casemask, data); + + u64a z = eq512mask(chars, v); + + if (negate) { + z = ~z & mask; + } + z &= mask; + if (unlikely(z)) { + return lastMatchOffset(buf + 64, z); + } + return NULL; +} + +static really_inline +const u8 *rvermSearchAligned(m512 chars, const u8 *buf, const u8 *buf_end, + char negate) { + assert((size_t)buf_end % 64 == 0); + for (; buf + 63 < buf_end; buf_end -= 64) { + m512 data = load512(buf_end - 64); + u64a z = eq512mask(chars, data); + if (negate) { + z = ~z & ~0ULL; + } + if (unlikely(z)) { + return lastMatchOffset(buf_end, z); + } + } + return NULL; +} + +static really_inline +const u8 *rvermSearchAlignedNocase(m512 chars, const u8 *buf, + const u8 *buf_end, char negate) { + assert((size_t)buf_end % 64 == 
0); + m512 casemask = set64x8(CASE_CLEAR); + + for (; buf + 63 < buf_end; buf_end -= 64) { + m512 data = load512(buf_end - 64); + u64a z = eq512mask(chars, and512(casemask, data)); + if (negate) { + z = ~z & ~0ULL; + } + if (unlikely(z)) { + return lastMatchOffset(buf_end, z); + } + } + return NULL; +} + +// returns NULL if not found +static really_inline +const u8 *rvermUnalign(m512 chars, const u8 *buf, char negate) { + m512 data = loadu512(buf); // unaligned + u64a z = eq512mask(chars, data); + if (negate) { + z = ~z & ~0ULL; + } + if (unlikely(z)) { + return lastMatchOffset(buf + 64, z); + } + return NULL; +} + +// returns NULL if not found +static really_inline +const u8 *rvermUnalignNocase(m512 chars, const u8 *buf, char negate) { + m512 casemask = set64x8(CASE_CLEAR); + m512 data = loadu512(buf); // unaligned + u64a z = eq512mask(chars, and512(casemask, data)); + if (negate) { + z = ~z & ~0ULL; + } + if (unlikely(z)) { + return lastMatchOffset(buf + 64, z); + } + return NULL; +} + +static really_inline +const u8 *rdvermMini(m512 chars1, m512 chars2, const u8 *buf, + const u8 *buf_end) { + uintptr_t len = buf_end - buf; + __mmask64 mask = (~0ULL) >> (64 - len); + m512 data = loadu_maskz_m512(mask, buf); + + u64a z = eq512mask(chars2, data) & (eq512mask(chars1, data) << 1); + + z &= mask; + if (unlikely(z)) { + return lastMatchOffset(buf + 64, z); + } + return NULL; +} + +static really_inline +const u8 *rdvermMiniNocase(m512 chars1, m512 chars2, const u8 *buf, + const u8 *buf_end) { + uintptr_t len = buf_end - buf; + __mmask64 mask = (~0ULL) >> (64 - len); + m512 data = loadu_maskz_m512(mask, buf); + m512 casemask = set64x8(CASE_CLEAR); + m512 v = and512(casemask, data); + + u64a z = eq512mask(chars2, v) & (eq512mask(chars1, v) << 1); + + z &= mask; + if (unlikely(z)) { + return lastMatchOffset(buf + 64, z); + } + return NULL; +} + +static really_inline +const u8 *rdvermSearchAligned(m512 chars1, m512 chars2, u8 c1, u8 c2, + const u8 *buf, const u8 *buf_end) { + assert((size_t)buf_end % 64 == 0); + + for (; buf + 64 < buf_end; buf_end -= 64) { + m512 data = load512(buf_end - 64); + u64a z = eq512mask(chars2, data) & (eq512mask(chars1, data) << 1); + if (buf_end[-65] == c1 && buf_end[-64] == c2) { + z |= 1; + } + if (unlikely(z)) { + return lastMatchOffset(buf_end, z); + } + } + return buf_end; +} + +static really_inline +const u8 *rdvermSearchAlignedNocase(m512 chars1, m512 chars2, u8 c1, u8 c2, + const u8 *buf, const u8 *buf_end) { + assert((size_t)buf_end % 64 == 0); + m512 casemask = set64x8(CASE_CLEAR); + + for (; buf + 64 < buf_end; buf_end -= 64) { + m512 data = load512(buf_end - 64); + m512 v = and512(casemask, data); + u64a z = eq512mask(chars2, v) & (eq512mask(chars1, v) << 1); + if ((buf_end[-65] & CASE_CLEAR) == c1 + && (buf_end[-64] & CASE_CLEAR) == c2) { + z |= 1; + } + if (unlikely(z)) { + return lastMatchOffset(buf_end, z); + } + } + return buf_end; +} + +// returns NULL if not found +static really_inline +const u8 *rdvermPrecondition(m512 chars1, m512 chars2, const u8 *buf) { + m512 data = loadu512(buf); + u64a z = eq512mask(chars2, data) & (eq512mask(chars1, data) << 1); + + // no fixup of the boundary required - the aligned run will pick it up + if (unlikely(z)) { + return lastMatchOffset(buf + 64, z); + } + + return NULL; +} + +// returns NULL if not found +static really_inline +const u8 *rdvermPreconditionNocase(m512 chars1, m512 chars2, const u8 *buf) { + // due to laziness, nonalphas and nocase having interesting behaviour + m512 casemask = set64x8(CASE_CLEAR); + 
m512 data = loadu512(buf); + m512 v = and512(casemask, data); + u64a z = eq512mask(chars2, v) & (eq512mask(chars1, v) << 1); + // no fixup of the boundary required - the aligned run will pick it up + if (unlikely(z)) { + return lastMatchOffset(buf + 64, z); + } + + return NULL; +} + +#endif // HAVE_AVX512 diff --git a/regex/report.h b/regex/report.h new file mode 100644 index 000000000..b35f4c052 --- /dev/null +++ b/regex/report.h @@ -0,0 +1,392 @@ +/* + * Copyright (c) 2016-2019, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Runtime functions to do with reports, inlined into callers. + */ + +#ifndef REPORT_H +#define REPORT_H + +#include "hs_internal.h" +#include "hs_runtime.h" +#include "scratch.h" +#include "ue2common.h" +#include "nfa/callback.h" +#include "nfa/nfa_internal.h" +#include "rose/runtime.h" +#include "som/som_runtime.h" +#include "util/exhaust.h" +#include "util/logical.h" +#include "util/fatbit.h" + +enum DedupeResult { + DEDUPE_CONTINUE, //!< Continue with match, not a dupe. + DEDUPE_SKIP, //!< Don't report this match, dupe or delayed due to SOM. + DEDUPE_HALT //!< User instructed us to stop matching. +}; + +static really_inline +enum DedupeResult dedupeCatchup(const struct RoseEngine *rose, + struct hs_scratch *scratch, u64a offset, + u64a from_offset, u64a to_offset, u32 dkey, + s32 offset_adjust, char is_external_report, + char quash_som, const char do_som) { + DEBUG_PRINTF("offset=%llu, match=[%llu,%llu], dkey=%u, do_som=%d\n", offset, + from_offset, to_offset, dkey, do_som); + + // We should not have been called if there's no dedupe work to do. 
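+    // The dedupe log is keyed on (offset % 2): when the report offset
+    // advances by exactly one, only the slot about to be reused is cleared,
+    // while any larger jump clears both slots.  fatbit_set() on the current
+    // slot then reports whether this dkey was already raised at this offset,
+    // which is what turns a duplicate into DEDUPE_SKIP below.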
+ assert(do_som || dkey != MO_INVALID_IDX); + + struct match_deduper *deduper = &scratch->deduper; + if (offset != deduper->current_report_offset) { + assert(deduper->current_report_offset == ~0ULL || + deduper->current_report_offset < offset); + if (offset == deduper->current_report_offset + 1) { + fatbit_clear(deduper->log[offset % 2]); + } else { + fatbit_clear(deduper->log[0]); + fatbit_clear(deduper->log[1]); + } + + if (do_som && flushStoredSomMatches(scratch, offset)) { + return DEDUPE_HALT; + } + deduper->current_report_offset = offset; + } + + if (dkey != MO_INVALID_IDX) { + const u32 dkeyCount = rose->dkeyCount; + if (is_external_report || quash_som) { + DEBUG_PRINTF("checking dkey %u at offset %llu\n", dkey, to_offset); + assert(offset_adjust == 0 || offset_adjust == -1); + if (fatbit_set(deduper->log[to_offset % 2], dkeyCount, dkey)) { + /* we have already raised this report at this offset, squash + * dupe match. */ + DEBUG_PRINTF("dedupe\n"); + return DEDUPE_SKIP; + } + } else if (do_som) { + /* SOM external event */ + DEBUG_PRINTF("checking dkey %u at offset %llu\n", dkey, to_offset); + assert(offset_adjust == 0 || offset_adjust == -1); + u64a *starts = deduper->som_start_log[to_offset % 2]; + if (fatbit_set(deduper->som_log[to_offset % 2], dkeyCount, dkey)) { + starts[dkey] = MIN(starts[dkey], from_offset); + } else { + starts[dkey] = from_offset; + } + DEBUG_PRINTF("starts[%u]=%llu\n", dkey, starts[dkey]); + + if (offset_adjust) { + deduper->som_log_dirty |= 1; + } else { + deduper->som_log_dirty |= 2; + } + + return DEDUPE_SKIP; + } + } + + return DEDUPE_CONTINUE; +} + +/** \brief Test whether the given key (\a ekey) is set in the exhaustion vector + * \a evec. */ +static really_inline +int isExhausted(const struct RoseEngine *rose, const char *evec, u32 ekey) { + DEBUG_PRINTF("checking exhaustion %p %u\n", evec, ekey); + assert(ekey != INVALID_EKEY); + assert(ekey < rose->ekeyCount); + return mmbit_isset((const u8 *)evec, rose->ekeyCount, ekey); +} + +/** \brief Returns 1 if all exhaustion keys in the bitvector are on. */ +static really_inline +int isAllExhausted(const struct RoseEngine *rose, const char *evec) { + if (!rose->canExhaust) { + return 0; /* pattern set is inexhaustible */ + } + + return mmbit_all((const u8 *)evec, rose->ekeyCount); +} + +/** \brief Mark key \a ekey on in the exhaustion vector. */ +static really_inline +void markAsMatched(const struct RoseEngine *rose, char *evec, u32 ekey) { + DEBUG_PRINTF("marking as exhausted key %u\n", ekey); + assert(ekey != INVALID_EKEY); + assert(ekey < rose->ekeyCount); + mmbit_set((u8 *)evec, rose->ekeyCount, ekey); +} + +/** \brief Clear all keys in the exhaustion vector. */ +static really_inline +void clearEvec(const struct RoseEngine *rose, char *evec) { + DEBUG_PRINTF("clearing evec %p %u\n", evec, rose->ekeyCount); + mmbit_clear((u8 *)evec, rose->ekeyCount); +} + +/** \brief Test whether the given key (\a lkey) is set in the logical vector + * \a lvec. */ +static really_inline +char getLogicalVal(const struct RoseEngine *rose, const char *lvec, u32 lkey) { + DEBUG_PRINTF("checking lkey matching %p %u\n", lvec, lkey); + assert(lkey != INVALID_LKEY); + assert(lkey < rose->lkeyCount + rose->lopCount); + return mmbit_isset((const u8 *)lvec, rose->lkeyCount + rose->lopCount, + lkey); +} + +/** \brief Mark key \a lkey on in the logical vector. 
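+ *
+ * The logical vector stores rose->lkeyCount sub-expression keys followed by
+ * rose->lopCount operator results, so keys from either range are valid here
+ * and the multibit is sized as lkeyCount + lopCount.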
*/ +static really_inline +void setLogicalVal(const struct RoseEngine *rose, char *lvec, u32 lkey, + char val) { + DEBUG_PRINTF("marking as matched logical key %u\n", lkey); + assert(lkey != INVALID_LKEY); + assert(lkey < rose->lkeyCount + rose->lopCount); + switch (val) { + case 0: + mmbit_unset((u8 *)lvec, rose->lkeyCount + rose->lopCount, lkey); + break; + default: + mmbit_set((u8 *)lvec, rose->lkeyCount + rose->lopCount, lkey); + break; + } +} + +/** \brief Mark key \a ckey on in the combination vector. */ +static really_inline +void setCombinationActive(const struct RoseEngine *rose, char *cvec, u32 ckey) { + DEBUG_PRINTF("marking as active combination key %u\n", ckey); + assert(ckey != INVALID_CKEY); + assert(ckey < rose->ckeyCount); + mmbit_set((u8 *)cvec, rose->ckeyCount, ckey); +} + +/** \brief Returns 1 if compliant to all logical combinations. */ +static really_inline +char isLogicalCombination(const struct RoseEngine *rose, char *lvec, + u32 start, u32 result) { + const struct LogicalOp *logicalTree = (const struct LogicalOp *) + ((const char *)rose + rose->logicalTreeOffset); + assert(start >= rose->lkeyCount); + assert(start <= result); + assert(result < rose->lkeyCount + rose->lopCount); + for (u32 i = start; i <= result; i++) { + const struct LogicalOp *op = logicalTree + (i - rose->lkeyCount); + assert(i == op->id); + assert(op->op <= LAST_LOGICAL_OP); + switch ((enum LogicalOpType)op->op) { + case LOGICAL_OP_NOT: + setLogicalVal(rose, lvec, op->id, + !getLogicalVal(rose, lvec, op->ro)); + break; + case LOGICAL_OP_AND: + setLogicalVal(rose, lvec, op->id, + getLogicalVal(rose, lvec, op->lo) & + getLogicalVal(rose, lvec, op->ro)); // && + break; + case LOGICAL_OP_OR: + setLogicalVal(rose, lvec, op->id, + getLogicalVal(rose, lvec, op->lo) | + getLogicalVal(rose, lvec, op->ro)); // || + break; + } + } + return getLogicalVal(rose, lvec, result); +} + +/** \brief Returns 1 if combination matches when no sub-expression matches. 
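+ *
+ * For illustration, take a purely negative combination !A where A has lkey 0
+ * and lkeyCount == 1, so the NOT operator gets id 1 with ro == 0.  If A never
+ * matched, getLogicalVal(rose, lvec, 0) is 0, the operand check passes, id 1
+ * evaluates to 1 and the function returns 1: the combination should still be
+ * reported even though no sub-expression matched.  If A did match, the
+ * op->ro < lkeyCount check sees a set sub-expression key and returns 0.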
*/ +static really_inline +char isPurelyNegativeMatch(const struct RoseEngine *rose, char *lvec, + u32 start, u32 result) { + const struct LogicalOp *logicalTree = (const struct LogicalOp *) + ((const char *)rose + rose->logicalTreeOffset); + assert(start >= rose->lkeyCount); + assert(start <= result); + assert(result < rose->lkeyCount + rose->lopCount); + for (u32 i = start; i <= result; i++) { + const struct LogicalOp *op = logicalTree + (i - rose->lkeyCount); + assert(i == op->id); + assert(op->op <= LAST_LOGICAL_OP); + switch ((enum LogicalOpType)op->op) { + case LOGICAL_OP_NOT: + if ((op->ro < rose->lkeyCount) && + getLogicalVal(rose, lvec, op->ro)) { + // sub-expression not negative + return 0; + } + setLogicalVal(rose, lvec, op->id, + !getLogicalVal(rose, lvec, op->ro)); + break; + case LOGICAL_OP_AND: + if (((op->lo < rose->lkeyCount) && + getLogicalVal(rose, lvec, op->lo)) || + ((op->ro < rose->lkeyCount) && + getLogicalVal(rose, lvec, op->ro))) { + // sub-expression not negative + return 0; + } + setLogicalVal(rose, lvec, op->id, + getLogicalVal(rose, lvec, op->lo) & + getLogicalVal(rose, lvec, op->ro)); // && + break; + case LOGICAL_OP_OR: + if (((op->lo < rose->lkeyCount) && + getLogicalVal(rose, lvec, op->lo)) || + ((op->ro < rose->lkeyCount) && + getLogicalVal(rose, lvec, op->ro))) { + // sub-expression not negative + return 0; + } + setLogicalVal(rose, lvec, op->id, + getLogicalVal(rose, lvec, op->lo) | + getLogicalVal(rose, lvec, op->ro)); // || + break; + } + } + return getLogicalVal(rose, lvec, result); +} + +/** \brief Clear all keys in the logical vector. */ +static really_inline +void clearLvec(const struct RoseEngine *rose, char *lvec, char *cvec) { + DEBUG_PRINTF("clearing lvec %p %u\n", lvec, + rose->lkeyCount + rose->lopCount); + DEBUG_PRINTF("clearing cvec %p %u\n", cvec, rose->ckeyCount); + mmbit_clear((u8 *)lvec, rose->lkeyCount + rose->lopCount); + mmbit_clear((u8 *)cvec, rose->ckeyCount); +} + +/** \brief Clear all keys in the combination vector. */ +static really_inline +void clearCvec(const struct RoseEngine *rose, char *cvec) { + DEBUG_PRINTF("clearing cvec %p %u\n", cvec, rose->ckeyCount); + mmbit_clear((u8 *)cvec, rose->ckeyCount); +} + +/** + * \brief Deliver the given report to the user callback. + * + * Assumes all preconditions (bounds, exhaustion etc) have been checked and + * that dedupe catchup has been done. 
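+ *
+ * As a sketch of the callback contract (handler name and body are only an
+ * example), a non-zero return from the user callback asks the engine to halt:
+ *
+ *     static int on_match(unsigned int id, unsigned long long from,
+ *                         unsigned long long to, unsigned int flags,
+ *                         void *ctx) {
+ *         return 1;   // stop after the first match
+ *     }
+ *
+ * in which case the deliver functions set STATUS_TERMINATED and return
+ * MO_HALT_MATCHING; a zero return lets matching continue.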
+ */ +static really_inline +int roseDeliverReport(u64a offset, ReportID onmatch, s32 offset_adjust, + struct hs_scratch *scratch, u32 ekey) { + assert(scratch); + assert(scratch->magic == SCRATCH_MAGIC); + + struct core_info *ci = &scratch->core_info; + + u32 flags = 0; +#ifndef RELEASE_BUILD + if (offset_adjust) { + // alert testing tools that we've got adjusted matches + flags |= HS_MATCH_FLAG_ADJUSTED; + } +#endif + + assert(!can_stop_matching(scratch)); + assert(ekey == INVALID_EKEY || + !isExhausted(ci->rose, ci->exhaustionVector, ekey)); + + u64a from_offset = 0; + u64a to_offset = offset + offset_adjust; + + DEBUG_PRINTF(">> reporting match @[%llu,%llu] for sig %u ctxt %p <<\n", + from_offset, to_offset, onmatch, ci->userContext); + + int halt = ci->userCallback(onmatch, from_offset, to_offset, flags, + ci->userContext); + if (halt) { + DEBUG_PRINTF("callback requested to terminate matches\n"); + ci->status |= STATUS_TERMINATED; + return MO_HALT_MATCHING; + } + + if (ekey != INVALID_EKEY) { + markAsMatched(ci->rose, ci->exhaustionVector, ekey); + return MO_CONTINUE_MATCHING; + } else { + return ROSE_CONTINUE_MATCHING_NO_EXHAUST; + } +} + +/** + * \brief Deliver the given SOM report to the user callback. + * + * Assumes all preconditions (bounds, exhaustion etc) have been checked and + * that dedupe catchup has been done. + */ +static really_inline +int roseDeliverSomReport(u64a from_offset, u64a to_offset, ReportID onmatch, + s32 offset_adjust, struct hs_scratch *scratch, + u32 ekey) { + assert(scratch); + assert(scratch->magic == SCRATCH_MAGIC); + + struct core_info *ci = &scratch->core_info; + + u32 flags = 0; +#ifndef RELEASE_BUILD + if (offset_adjust) { + // alert testing tools that we've got adjusted matches + flags |= HS_MATCH_FLAG_ADJUSTED; + } +#endif + + assert(!can_stop_matching(scratch)); + assert(ekey == INVALID_EKEY || + !isExhausted(ci->rose, ci->exhaustionVector, ekey)); + + to_offset += offset_adjust; + assert(from_offset == HS_OFFSET_PAST_HORIZON || from_offset <= to_offset); + + DEBUG_PRINTF(">> reporting match @[%llu,%llu] for sig %u ctxt %p <<\n", + from_offset, to_offset, onmatch, ci->userContext); + + int halt = ci->userCallback(onmatch, from_offset, to_offset, flags, + ci->userContext); + + if (halt) { + DEBUG_PRINTF("callback requested to terminate matches\n"); + ci->status |= STATUS_TERMINATED; + return MO_HALT_MATCHING; + } + + if (ekey != INVALID_EKEY) { + markAsMatched(ci->rose, ci->exhaustionVector, ekey); + return MO_CONTINUE_MATCHING; + } else { + return ROSE_CONTINUE_MATCHING_NO_EXHAUST; + } +} + +#endif // REPORT_H diff --git a/regex/rose/block.c b/regex/rose/block.c new file mode 100644 index 000000000..b3f424cb7 --- /dev/null +++ b/regex/rose/block.c @@ -0,0 +1,422 @@ +/* + * Copyright (c) 2015-2019, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "catchup.h" +#include "init.h" +#include "match.h" +#include "program_runtime.h" +#include "rose.h" +#include "rose_common.h" +#include "nfa/nfa_api.h" +#include "nfa/nfa_internal.h" +#include "nfa/nfa_rev_api.h" +#include "nfa/mcclellan.h" +#include "util/fatbit.h" + +static rose_inline +void runAnchoredTableBlock(const struct RoseEngine *t, const void *atable, + struct hs_scratch *scratch) { + const u8 *buffer = scratch->core_info.buf; + size_t length = scratch->core_info.len; + size_t alen = MIN(length, t->anchoredDistance); + const struct anchored_matcher_info *curr = atable; + + DEBUG_PRINTF("BEGIN ANCHORED (over %zu/%zu)\n", alen, length); + + do { + const struct NFA *nfa + = (const struct NFA *)((const char *)curr + sizeof(*curr)); + + assert(t->anchoredDistance > curr->anchoredMinDistance); + if (length >= curr->anchoredMinDistance) { + size_t local_alen = alen - curr->anchoredMinDistance; + const u8 *local_buffer = buffer + curr->anchoredMinDistance; + + DEBUG_PRINTF("--anchored nfa (+%u)\n", curr->anchoredMinDistance); + assert(isMcClellanType(nfa->type)); + if (nfa->type == MCCLELLAN_NFA_8) { + nfaExecMcClellan8_B(nfa, curr->anchoredMinDistance, + local_buffer, local_alen, + roseAnchoredCallback, scratch); + } else { + nfaExecMcClellan16_B(nfa, curr->anchoredMinDistance, + local_buffer, local_alen, + roseAnchoredCallback, scratch); + } + } + + if (!curr->next_offset) { + break; + } + + curr = (const void *)((const char *)curr + curr->next_offset); + } while (1); +} + +static really_inline +void init_state_for_block(const struct RoseEngine *t, char *state) { + assert(t); + assert(state); + + DEBUG_PRINTF("init for Rose %p with %u state indices\n", t, + t->rolesWithStateCount); + + // Rose is guaranteed 8-aligned state + assert(ISALIGNED_N(state, 8)); + + init_state(t, state); +} + +static really_inline +void init_outfixes_for_block(const struct RoseEngine *t, + struct hs_scratch *scratch, char *state, + char is_small_block) { + /* active leaf array has been cleared by the init scatter */ + + if (t->initMpvNfa != MO_INVALID_IDX) { + assert(t->initMpvNfa == 0); + const struct NFA *nfa = getNfaByQueue(t, 0); + DEBUG_PRINTF("testing minwidth %u > len %zu\n", nfa->minWidth, + scratch->core_info.len); + size_t len = nfaRevAccelCheck(nfa, scratch->core_info.buf, + scratch->core_info.len); + if (len) { + u8 *activeArray = getActiveLeafArray(t, state); + const u32 activeArraySize = t->activeArrayCount; + const u32 qCount = t->queueCount; + + mmbit_set(activeArray, activeArraySize, 0); + fatbit_set(scratch->aqa, qCount, 0); + + struct mq *q = scratch->queues; + initQueue(q, 0, t, scratch); + q->length = len; /* adjust for rev_accel */ + nfaQueueInitState(nfa, q); + pushQueueAt(q, 0, 
MQE_START, 0); + pushQueueAt(q, 1, MQE_TOP, 0); + } + } + + if (is_small_block && !t->hasOutfixesInSmallBlock) { + DEBUG_PRINTF("all outfixes in small block table\n"); + return; + } + + if (t->outfixBeginQueue != t->outfixEndQueue) { + blockInitSufPQ(t, state, scratch, is_small_block); + } +} + +static really_inline +void init_for_block(const struct RoseEngine *t, struct hs_scratch *scratch, + char *state, char is_small_block) { + init_state_for_block(t, state); + + struct RoseContext *tctxt = &scratch->tctxt; + + tctxt->groups = t->initialGroups; + tctxt->lit_offset_adjust = 1; // index after last byte + tctxt->delayLastEndOffset = 0; + tctxt->lastEndOffset = 0; + tctxt->filledDelayedSlots = 0; + tctxt->lastMatchOffset = 0; + tctxt->lastCombMatchOffset = 0; + tctxt->minMatchOffset = 0; + tctxt->minNonMpvMatchOffset = 0; + tctxt->next_mpv_offset = 0; + + scratch->al_log_sum = 0; + + fatbit_clear(scratch->aqa); + + scratch->catchup_pq.qm_size = 0; + + init_outfixes_for_block(t, scratch, state, is_small_block); +} + +static rose_inline +void roseBlockEodExec(const struct RoseEngine *t, u64a offset, + struct hs_scratch *scratch) { + assert(t->requiresEodCheck); + assert(t->maxBiAnchoredWidth == ROSE_BOUND_INF + || offset <= t->maxBiAnchoredWidth); + + assert(!can_stop_matching(scratch)); + assert(t->eodProgramOffset); + + // Ensure that history is correct before we look for EOD matches. + roseFlushLastByteHistory(t, scratch, offset); + scratch->tctxt.lastEndOffset = offset; + + DEBUG_PRINTF("running eod program at %u\n", t->eodProgramOffset); + + // There should be no pending delayed literals. + assert(!scratch->tctxt.filledDelayedSlots); + + const u64a som = 0; + const u8 flags = ROSE_PROG_FLAG_SKIP_MPV_CATCHUP; + + // Note: we ignore the result, as this is the last thing to ever happen on + // a scan. + roseRunProgram(t, scratch, t->eodProgramOffset, som, offset, flags); +} + +/** + * \brief Run the anchored matcher, if any. Returns non-zero if matching should + * halt. + */ +static rose_inline +int roseBlockAnchored(const struct RoseEngine *t, struct hs_scratch *scratch) { + const void *atable = getALiteralMatcher(t); + if (!atable) { + DEBUG_PRINTF("no anchored table\n"); + return 0; + } + + const size_t length = scratch->core_info.len; + + if (t->amatcherMaxBiAnchoredWidth != ROSE_BOUND_INF && + length > t->amatcherMaxBiAnchoredWidth) { + return 0; + } + + if (length < t->amatcherMinWidth) { + return 0; + } + + runAnchoredTableBlock(t, atable, scratch); + + return can_stop_matching(scratch); +} + +/** + * \brief Run the floating matcher, if any. Returns non-zero if matching should + * halt. 
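+ *
+ * For example (illustrative numbers only): with a 100-byte block,
+ * floatingDistance == 32 and floatingMinDistance == 8, flen becomes
+ * MIN(32, 100) == 32 and hwlmExec() scans those 32 bytes starting from
+ * offset 8; if flen were <= floatingMinDistance the function would simply
+ * return 0 without scanning.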
+ */ +static rose_inline +int roseBlockFloating(const struct RoseEngine *t, struct hs_scratch *scratch) { + const struct HWLM *ftable = getFLiteralMatcher(t); + if (!ftable) { + return 0; + } + + const size_t length = scratch->core_info.len; + char *state = scratch->core_info.state; + struct RoseContext *tctxt = &scratch->tctxt; + + DEBUG_PRINTF("ftable fd=%u fmd %u\n", t->floatingDistance, + t->floatingMinDistance); + if (t->noFloatingRoots && !roseHasInFlightMatches(t, state, scratch)) { + DEBUG_PRINTF("skip FLOATING: no inflight matches\n"); + return 0; + } + + if (t->fmatcherMaxBiAnchoredWidth != ROSE_BOUND_INF && + length > t->fmatcherMaxBiAnchoredWidth) { + return 0; + } + + if (length < t->fmatcherMinWidth) { + return 0; + } + + const u8 *buffer = scratch->core_info.buf; + size_t flen = length; + if (t->floatingDistance != ROSE_BOUND_INF) { + flen = MIN(t->floatingDistance, length); + } + if (flen <= t->floatingMinDistance) { + return 0; + } + + DEBUG_PRINTF("BEGIN FLOATING (over %zu/%zu)\n", flen, length); + DEBUG_PRINTF("-- %016llx\n", tctxt->groups); + hwlmExec(ftable, buffer, flen, t->floatingMinDistance, roseFloatingCallback, + scratch, tctxt->groups & t->floating_group_mask); + + return can_stop_matching(scratch); +} + +static rose_inline +void runEagerPrefixesBlock(const struct RoseEngine *t, + struct hs_scratch *scratch) { + if (!t->eagerIterOffset) { + return; + } + + char *state = scratch->core_info.state; + u8 *ara = getActiveLeftArray(t, state); /* indexed by offsets into + * left_table */ + const u32 arCount = t->activeLeftCount; + const u32 qCount = t->queueCount; + const struct LeftNfaInfo *left_table = getLeftTable(t); + const struct mmbit_sparse_iter *it = getByOffset(t, t->eagerIterOffset); + + struct mmbit_sparse_state si_state[MAX_SPARSE_ITER_STATES]; + + u32 idx = 0; + u32 ri = mmbit_sparse_iter_begin(ara, arCount, &idx, it, si_state); + for (; ri != MMB_INVALID; + ri = mmbit_sparse_iter_next(ara, arCount, ri, &idx, it, si_state)) { + const struct LeftNfaInfo *left = left_table + ri; + u32 qi = ri + t->leftfixBeginQueue; + DEBUG_PRINTF("leftfix %u/%u, maxLag=%u\n", ri, arCount, left->maxLag); + + assert(!fatbit_isset(scratch->aqa, qCount, qi)); + assert(left->eager); + assert(!left->infix); + + struct mq *q = scratch->queues + qi; + const struct NFA *nfa = getNfaByQueue(t, qi); + + if (scratch->core_info.len < nfa->minWidth) { + /* we know that there is not enough data for this to ever match, so + * we can immediately squash/ */ + mmbit_unset(ara, arCount, ri); + scratch->tctxt.groups &= left->squash_mask; + } + + s64a loc = MIN(scratch->core_info.len, EAGER_STOP_OFFSET); + + fatbit_set(scratch->aqa, qCount, qi); + initRoseQueue(t, qi, left, scratch); + + pushQueueAt(q, 0, MQE_START, 0); + pushQueueAt(q, 1, MQE_TOP, 0); + pushQueueAt(q, 2, MQE_END, loc); + nfaQueueInitState(nfa, q); + + char alive = nfaQueueExecToMatch(q->nfa, q, loc); + + if (!alive) { + DEBUG_PRINTF("queue %u dead, squashing\n", qi); + mmbit_unset(ara, arCount, ri); + fatbit_unset(scratch->aqa, qCount, qi); + scratch->tctxt.groups &= left->squash_mask; + } else if (q->cur == q->end) { + assert(alive != MO_MATCHES_PENDING); + if (loc == (s64a)scratch->core_info.len) { + /* We know that the prefix does not match in the block so we + * can squash the groups anyway even though it did not die */ + /* TODO: if we knew the minimum lag the leftfix is checked at we + * could make this check tighter */ + DEBUG_PRINTF("queue %u has no match in block, squashing\n", qi); + mmbit_unset(ara, arCount, ri); + 
fatbit_unset(scratch->aqa, qCount, qi); + scratch->tctxt.groups &= left->squash_mask; + } else { + DEBUG_PRINTF("queue %u finished, nfa lives\n", qi); + q->cur = q->end = 0; + pushQueueAt(q, 0, MQE_START, loc); + } + } else { + assert(alive == MO_MATCHES_PENDING); + DEBUG_PRINTF("queue %u unfinished, nfa lives\n", qi); + q->end--; /* remove end item */ + } + } +} + +void roseBlockExec(const struct RoseEngine *t, struct hs_scratch *scratch) { + assert(t); + assert(scratch); + assert(scratch->core_info.buf); + assert(mmbit_sparse_iter_state_size(t->rolesWithStateCount) + < MAX_SPARSE_ITER_STATES); + + // We should not have been called if we've already been told to terminate + // matching. + assert(!told_to_stop_matching(scratch)); + + // If this block is shorter than our minimum width, then no pattern in this + // RoseEngine could match. + /* minWidth checks should have already been performed by the caller */ + assert(scratch->core_info.len >= t->minWidth); + + // Similarly, we may have a maximum width (for engines constructed entirely + // of bi-anchored patterns). + /* This check is now handled by the interpreter */ + assert(t->maxBiAnchoredWidth == ROSE_BOUND_INF + || scratch->core_info.len <= t->maxBiAnchoredWidth); + + const size_t length = scratch->core_info.len; + + // We have optimizations for small block scans: we run a single coalesced + // HWLM scan instead of running the anchored and floating matchers. Some + // outfixes are disabled as well (for SEP scans of single-byte literals, + // which are also run in the HWLM scan). + const char is_small_block = + (length < ROSE_SMALL_BLOCK_LEN && t->sbmatcherOffset); + + char *state = scratch->core_info.state; + + init_for_block(t, scratch, state, is_small_block); + + struct RoseContext *tctxt = &scratch->tctxt; + + if (is_small_block) { + const void *sbtable = getSBLiteralMatcher(t); + assert(sbtable); + + size_t sblen = MIN(length, t->smallBlockDistance); + + DEBUG_PRINTF("BEGIN SMALL BLOCK (over %zu/%zu)\n", sblen, length); + DEBUG_PRINTF("-- %016llx\n", tctxt->groups); + hwlmExec(sbtable, scratch->core_info.buf, sblen, 0, roseCallback, + scratch, tctxt->groups); + } else { + runEagerPrefixesBlock(t, scratch); + + if (roseBlockAnchored(t, scratch)) { + return; + } + if (roseBlockFloating(t, scratch)) { + return; + } + } + + if (cleanUpDelayed(t, scratch, length, 0) == HWLM_TERMINATE_MATCHING) { + return; + } + + assert(!can_stop_matching(scratch)); + + roseCatchUpTo(t, scratch, length); + + if (!t->requiresEodCheck || !t->eodProgramOffset) { + DEBUG_PRINTF("no eod check required\n"); + return; + } + + if (can_stop_matching(scratch)) { + DEBUG_PRINTF("bailing, already halted\n"); + return; + } + + roseBlockEodExec(t, length, scratch); +} diff --git a/regex/rose/catchup.c b/regex/rose/catchup.c new file mode 100644 index 000000000..7a6648da9 --- /dev/null +++ b/regex/rose/catchup.c @@ -0,0 +1,900 @@ +/* + * Copyright (c) 2015-2018, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** + * \file + * \brief Rose runtime: code for catching up output-exposed engines. + */ + +#include "catchup.h" +#include "match.h" +#include "program_runtime.h" +#include "rose.h" +#include "nfa/nfa_rev_api.h" +#include "nfa/mpv.h" +#include "som/som_runtime.h" +#include "util/fatbit.h" +#include "report.h" + +typedef struct queue_match PQ_T; +#define PQ_COMP(pqc_items, a, b) ((pqc_items)[a].loc < (pqc_items)[b].loc) +#define PQ_COMP_B(pqc_items, a, b_fixed) ((pqc_items)[a].loc < (b_fixed).loc) + +#include "util/pqueue.h" + +static really_inline +int roseNfaRunProgram(const struct RoseEngine *rose, struct hs_scratch *scratch, + u64a som, u64a offset, ReportID id, const char from_mpv) { + const u32 program = id; + u8 flags = ROSE_PROG_FLAG_IN_CATCHUP; + if (from_mpv) { + flags |= ROSE_PROG_FLAG_FROM_MPV; + } + + roseRunProgram(rose, scratch, program, som, offset, flags); + + return can_stop_matching(scratch) ? MO_HALT_MATCHING : MO_CONTINUE_MATCHING; +} + +static rose_inline +char roseSuffixInfoIsExhausted(const struct RoseEngine *rose, + const struct NfaInfo *info, + const char *exhausted) { + if (!info->ekeyListOffset) { + return 0; + } + + DEBUG_PRINTF("check exhaustion -> start at %u\n", info->ekeyListOffset); + + /* INVALID_EKEY terminated list */ + const u32 *ekeys = getByOffset(rose, info->ekeyListOffset); + while (*ekeys != INVALID_EKEY) { + DEBUG_PRINTF("check %u\n", *ekeys); + if (!isExhausted(rose, exhausted, *ekeys)) { + DEBUG_PRINTF("not exhausted -> alive\n"); + return 0; + } + ++ekeys; + } + + DEBUG_PRINTF("all ekeys exhausted -> dead\n"); + return 1; +} + +static really_inline +char roseSuffixIsExhausted(const struct RoseEngine *rose, u32 qi, + const char *exhausted) { + DEBUG_PRINTF("check queue %u\n", qi); + const struct NfaInfo *info = getNfaInfoByQueue(rose, qi); + return roseSuffixInfoIsExhausted(rose, info, exhausted); +} + +static really_inline +void deactivateQueue(const struct RoseEngine *t, u8 *aa, u32 qi, + struct hs_scratch *scratch) { + u32 aaCount = t->activeArrayCount; + u32 qCount = t->queueCount; + + /* this is sailing close to the wind with regards to invalidating an + * iteration. 
We are saved by the fact that unsetting does not clear the + * summary bits -> the block under the gun remains valid + */ + DEBUG_PRINTF("killing off zombie queue %u\n", qi); + mmbit_unset(aa, aaCount, qi); + fatbit_unset(scratch->aqa, qCount, qi); +} + +static really_inline +void ensureQueueActive(const struct RoseEngine *t, u32 qi, u32 qCount, + struct mq *q, struct hs_scratch *scratch) { + if (!fatbit_set(scratch->aqa, qCount, qi)) { + DEBUG_PRINTF("initing %u\n", qi); + initQueue(q, qi, t, scratch); + loadStreamState(q->nfa, q, 0); + pushQueueAt(q, 0, MQE_START, 0); + } +} + +static really_inline +void pq_replace_top_with(struct catchup_pq *pq, + UNUSED struct hs_scratch *scratch, u32 queue, + s64a loc) { + DEBUG_PRINTF("inserting q%u in pq at %lld\n", queue, loc); + struct queue_match temp = { + .queue = queue, + .loc = (size_t)loc + }; + + assert(loc > 0); + assert(pq->qm_size); + assert(loc <= (s64a)scratch->core_info.len); + pq_replace_top(pq->qm, pq->qm_size, temp); +} + +static really_inline +void pq_insert_with(struct catchup_pq *pq, + UNUSED struct hs_scratch *scratch, u32 queue, s64a loc) { + DEBUG_PRINTF("inserting q%u in pq at %lld\n", queue, loc); + struct queue_match temp = { + .queue = queue, + .loc = (size_t)loc + }; + + assert(loc > 0); + assert(loc <= (s64a)scratch->core_info.len); + pq_insert(pq->qm, pq->qm_size, temp); + ++pq->qm_size; +} + +static really_inline +void pq_pop_nice(struct catchup_pq *pq) { + pq_pop(pq->qm, pq->qm_size); + pq->qm_size--; +} + +static really_inline +s64a pq_top_loc(struct catchup_pq *pq) { + assert(pq->qm_size); + return (s64a)pq_top(pq->qm)->loc; +} + +/* requires that we are the top item on the pq */ +static really_inline +hwlmcb_rv_t runExistingNfaToNextMatch(const struct RoseEngine *t, u32 qi, + struct mq *q, s64a loc, + struct hs_scratch *scratch, u8 *aa, + char report_curr) { + assert(pq_top(scratch->catchup_pq.qm)->queue == qi); + assert(scratch->catchup_pq.qm_size); + assert(!q->report_current); + if (report_curr) { + DEBUG_PRINTF("need to report matches\n"); + q->report_current = 1; + } + + DEBUG_PRINTF("running queue from %u:%lld to %lld\n", q->cur, q_cur_loc(q), + loc); + + assert(q_cur_loc(q) <= loc); + + char alive = nfaQueueExecToMatch(q->nfa, q, loc); + + /* exit via gift shop */ + if (alive == MO_MATCHES_PENDING) { + /* we have pending matches */ + assert(q_cur_loc(q) + scratch->core_info.buf_offset + >= scratch->tctxt.minMatchOffset); + pq_replace_top_with(&scratch->catchup_pq, scratch, qi, q_cur_loc(q)); + return HWLM_CONTINUE_MATCHING; + } else if (!alive) { + if (report_curr && can_stop_matching(scratch)) { + DEBUG_PRINTF("bailing\n"); + return HWLM_TERMINATE_MATCHING; + } + + deactivateQueue(t, aa, qi, scratch); + } else if (q->cur == q->end) { + DEBUG_PRINTF("queue %u finished, nfa lives\n", qi); + q->cur = q->end = 0; + pushQueueAt(q, 0, MQE_START, loc); + } else { + DEBUG_PRINTF("queue %u unfinished, nfa lives\n", qi); + u32 i = 0; + while (q->cur < q->end) { + q->items[i] = q->items[q->cur++]; + DEBUG_PRINTF("q[%u] = %u:%lld\n", i, q->items[i].type, + q->items[i].location); + assert(q->items[i].type != MQE_END); + i++; + } + q->cur = 0; + q->end = i; + } + + pq_pop_nice(&scratch->catchup_pq); + + return HWLM_CONTINUE_MATCHING; +} + +static really_inline +hwlmcb_rv_t runNewNfaToNextMatch(const struct RoseEngine *t, u32 qi, + struct mq *q, s64a loc, + struct hs_scratch *scratch, u8 *aa, + s64a report_ok_loc) { + assert(!q->report_current); + DEBUG_PRINTF("running queue from %u:%lld to %lld\n", q->cur, q_cur_loc(q), 
+ loc); + DEBUG_PRINTF("min match offset %llu\n", scratch->tctxt.minMatchOffset); + + char alive = 1; + +restart: + alive = nfaQueueExecToMatch(q->nfa, q, loc); + + if (alive == MO_MATCHES_PENDING) { + DEBUG_PRINTF("we have pending matches at %lld\n", q_cur_loc(q)); + s64a qcl = q_cur_loc(q); + + if (qcl == report_ok_loc) { + assert(q->cur != q->end); /* the queue shouldn't be empty if there + * are pending matches. */ + q->report_current = 1; + DEBUG_PRINTF("restarting...\n"); + goto restart; + } + assert(qcl + scratch->core_info.buf_offset + >= scratch->tctxt.minMatchOffset); + pq_insert_with(&scratch->catchup_pq, scratch, qi, qcl); + } else if (!alive) { + if (can_stop_matching(scratch)) { + DEBUG_PRINTF("bailing\n"); + return HWLM_TERMINATE_MATCHING; + } + + deactivateQueue(t, aa, qi, scratch); + } else if (q->cur == q->end) { + DEBUG_PRINTF("queue %u finished, nfa lives\n", qi); + q->cur = q->end = 0; + pushQueueAt(q, 0, MQE_START, loc); + } else { + DEBUG_PRINTF("queue %u unfinished, nfa lives\n", qi); + u32 i = 0; + while (q->cur < q->end) { + q->items[i] = q->items[q->cur++]; + DEBUG_PRINTF("q[%u] = %u:%lld\n", i, q->items[i].type, + q->items[i].location); + assert(q->items[i].type != MQE_END); + i++; + } + q->cur = 0; + q->end = i; + } + + return HWLM_CONTINUE_MATCHING; +} + +/* for use by mpv (chained) only */ +static +int roseNfaFinalBlastAdaptor(u64a start, u64a end, ReportID id, void *context) { + struct hs_scratch *scratch = context; + assert(scratch && scratch->magic == SCRATCH_MAGIC); + const struct RoseEngine *t = scratch->core_info.rose; + + DEBUG_PRINTF("id=%u matched at [%llu,%llu]\n", id, start, end); + + int cb_rv = roseNfaRunProgram(t, scratch, start, end, id, 1); + if (cb_rv == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } else if (cb_rv == ROSE_CONTINUE_MATCHING_NO_EXHAUST) { + return MO_CONTINUE_MATCHING; + } else { + assert(cb_rv == MO_CONTINUE_MATCHING); + return !roseSuffixIsExhausted(t, 0, + scratch->core_info.exhaustionVector); + } +} + +static really_inline +void ensureEnd(struct mq *q, UNUSED u32 qi, s64a final_loc) { + DEBUG_PRINTF("ensure MQE_END %lld for queue %u\n", final_loc, qi); + if (final_loc >= q_last_loc(q)) { + /* TODO: ensure situation does not arise */ + assert(q_last_type(q) != MQE_END); + pushQueueNoMerge(q, MQE_END, final_loc); + } +} + +static really_inline +hwlmcb_rv_t add_to_queue(const struct RoseEngine *t, struct mq *queues, + u32 qCount, u8 *aa, struct hs_scratch *scratch, + s64a loc, u32 qi, s64a report_ok_loc) { + struct mq *q = queues + qi; + const struct NfaInfo *info = getNfaInfoByQueue(t, qi); + + if (roseSuffixInfoIsExhausted(t, info, + scratch->core_info.exhaustionVector)) { + deactivateQueue(t, aa, qi, scratch); + return HWLM_CONTINUE_MATCHING; + } + + ensureQueueActive(t, qi, qCount, q, scratch); + + if (unlikely(loc < q_cur_loc(q))) { + DEBUG_PRINTF("err loc %lld < location %lld\n", loc, q_cur_loc(q)); + return HWLM_CONTINUE_MATCHING; + } + + ensureEnd(q, qi, loc); + + return runNewNfaToNextMatch(t, qi, q, loc, scratch, aa, report_ok_loc); +} + +static really_inline +s64a findSecondPlace(struct catchup_pq *pq, s64a loc_limit) { + assert(pq->qm_size); /* we are still on the pq and we are first place */ + + /* we know (*cough* encapsulation) that second place will either be in + * pq->qm[1] or pq->qm[2] (we are pq->qm[0]) */ + switch (pq->qm_size) { + case 0: + case 1: + return (s64a)loc_limit; + case 2: + return MIN((s64a)pq->qm[1].loc, loc_limit); + default:; + size_t best = MIN(pq->qm[1].loc, pq->qm[2].loc); + return 
MIN((s64a)best, loc_limit); + } +} + +hwlmcb_rv_t roseCatchUpMPV_i(const struct RoseEngine *t, s64a loc, + struct hs_scratch *scratch) { + char *state = scratch->core_info.state; + struct mq *queues = scratch->queues; + u8 *aa = getActiveLeafArray(t, state); + UNUSED u32 aaCount = t->activeArrayCount; + u32 qCount = t->queueCount; + + /* find first match of each pending nfa */ + DEBUG_PRINTF("aa=%p, aaCount=%u\n", aa, aaCount); + + assert(t->outfixBeginQueue == 1); + + u32 qi = 0; + assert(mmbit_isset(aa, aaCount, 0)); /* caller should have already bailed */ + + DEBUG_PRINTF("catching up qi=%u to loc %lld\n", qi, loc); + + struct mq *q = queues + qi; + const struct NfaInfo *info = getNfaInfoByQueue(t, qi); + u64a mpv_exec_end = scratch->core_info.buf_offset + loc; + u64a next_pos_match_loc = 0; + + if (roseSuffixInfoIsExhausted(t, info, + scratch->core_info.exhaustionVector)) { + deactivateQueue(t, aa, qi, scratch); + goto done; + } + + ensureQueueActive(t, qi, qCount, q, scratch); + + if (unlikely(loc < q_cur_loc(q))) { + DEBUG_PRINTF("err loc %lld < location %lld\n", loc, q_cur_loc(q)); + goto done; + } + + ensureEnd(q, qi, loc); + + assert(!q->report_current); + + q->cb = roseNfaFinalBlastAdaptor; + + DEBUG_PRINTF("queue %u blasting, %u/%u [%lld/%lld]\n", + qi, q->cur, q->end, q->items[q->cur].location, loc); + + scratch->tctxt.mpv_inactive = 0; + + /* we know it is going to be an mpv, skip the indirection */ + next_pos_match_loc = nfaExecMpv_QueueExecRaw(q->nfa, q, loc); + assert(!q->report_current); + + if (!next_pos_match_loc) { /* 0 means dead */ + DEBUG_PRINTF("mpv is pining for the fjords\n"); + if (can_stop_matching(scratch)) { + deactivateQueue(t, aa, qi, scratch); + return HWLM_TERMINATE_MATCHING; + } + + next_pos_match_loc = scratch->core_info.len; + scratch->tctxt.mpv_inactive = 1; + } + + if (q->cur == q->end) { + DEBUG_PRINTF("queue %u finished, nfa lives [%lld]\n", qi, loc); + q->cur = 0; + q->end = 0; + pushQueueAt(q, 0, MQE_START, loc); + } else { + DEBUG_PRINTF("queue %u not finished, nfa lives [%lld]\n", qi, loc); + } + +done: + if (t->flushCombProgramOffset) { + if (roseRunFlushCombProgram(t, scratch, mpv_exec_end) + == HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + } + updateMinMatchOffsetFromMpv(&scratch->tctxt, mpv_exec_end); + scratch->tctxt.next_mpv_offset + = MAX(next_pos_match_loc + scratch->core_info.buf_offset, + mpv_exec_end + 1); + + DEBUG_PRINTF("next match loc %lld (off %llu)\n", next_pos_match_loc, + scratch->tctxt.next_mpv_offset); + return can_stop_matching(scratch) ? 
HWLM_TERMINATE_MATCHING + : HWLM_CONTINUE_MATCHING; +} + +static really_inline +char in_mpv(const struct RoseEngine *rose, const struct hs_scratch *scratch) { + const struct RoseContext *tctxt = &scratch->tctxt; + assert(tctxt->curr_qi < rose->queueCount); + if (tctxt->curr_qi < rose->outfixBeginQueue) { + assert(getNfaByQueue(rose, tctxt->curr_qi)->type == MPV_NFA); + return 1; + } + return 0; +} + +static +int roseNfaBlastAdaptor(u64a start, u64a end, ReportID id, void *context) { + struct hs_scratch *scratch = context; + assert(scratch && scratch->magic == SCRATCH_MAGIC); + const struct RoseEngine *t = scratch->core_info.rose; + + DEBUG_PRINTF("id=%u matched at [%llu,%llu]\n", id, start, end); + + const char from_mpv = in_mpv(t, scratch); + int cb_rv = roseNfaRunProgram(t, scratch, start, end, id, from_mpv); + if (cb_rv == MO_HALT_MATCHING) { + return MO_HALT_MATCHING; + } else if (cb_rv == ROSE_CONTINUE_MATCHING_NO_EXHAUST) { + return MO_CONTINUE_MATCHING; + } else { + assert(cb_rv == MO_CONTINUE_MATCHING); + return !roseSuffixIsExhausted(t, scratch->tctxt.curr_qi, + scratch->core_info.exhaustionVector); + } +} + +int roseNfaAdaptor(u64a start, u64a end, ReportID id, void *context) { + struct hs_scratch *scratch = context; + assert(scratch && scratch->magic == SCRATCH_MAGIC); + + DEBUG_PRINTF("id=%u matched at [%llu,%llu]\n", id, start, end); + + /* must be a external report as haig cannot directly participate in chain */ + return roseNfaRunProgram(scratch->core_info.rose, scratch, start, end, id, + 0); +} + +static really_inline +char blast_queue(struct hs_scratch *scratch, struct mq *q, u32 qi, s64a to_loc, + char report_current) { + scratch->tctxt.curr_qi = qi; + q->cb = roseNfaBlastAdaptor; + q->report_current = report_current; + DEBUG_PRINTF("queue %u blasting, %u/%u [%lld/%lld]\n", qi, q->cur, q->end, + q_cur_loc(q), to_loc); + char alive = nfaQueueExec(q->nfa, q, to_loc); + q->cb = roseNfaAdaptor; + assert(!q->report_current); + + return alive; +} + +static really_inline +hwlmcb_rv_t buildSufPQ_final(const struct RoseEngine *t, s64a report_ok_loc, + s64a second_place_loc, s64a final_loc, + struct hs_scratch *scratch, u8 *aa, u32 a_qi) { + struct mq *q = scratch->queues + a_qi; + const struct NfaInfo *info = getNfaInfoByQueue(t, a_qi); + DEBUG_PRINTF("blasting qi=%u to %lld [final %lld]\n", a_qi, second_place_loc, + final_loc); + + if (roseSuffixInfoIsExhausted(t, info, + scratch->core_info.exhaustionVector)) { + deactivateQueue(t, aa, a_qi, scratch); + return HWLM_CONTINUE_MATCHING; + } + + ensureQueueActive(t, a_qi, t->queueCount, q, scratch); + + if (unlikely(final_loc < q_cur_loc(q))) { + DEBUG_PRINTF("err loc %lld < location %lld\n", final_loc, q_cur_loc(q)); + return HWLM_CONTINUE_MATCHING; + } + + ensureEnd(q, a_qi, final_loc); + + char alive = blast_queue(scratch, q, a_qi, second_place_loc, 0); + + /* We have three possible outcomes: + * (1) the nfa died + * (2) we completed the queue (implies that second_place_loc == final_loc) + * (3) the queue ran to second_place_loc and stopped. In this case we need + * to find the next match location. 
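+ *
+ * Case (3) is handled below by runNewNfaToNextMatch(), which advances the
+ * engine to its next match (if any) and inserts it into the catchup
+ * priority queue keyed on that location.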
+ */ + + if (!alive) { + if (can_stop_matching(scratch)) { + DEBUG_PRINTF("roseCatchUpNfas done as bailing\n"); + return HWLM_TERMINATE_MATCHING; + } + + deactivateQueue(t, aa, a_qi, scratch); + } else if (q->cur == q->end) { + DEBUG_PRINTF("queue %u finished, nfa lives [%lld]\n", a_qi, final_loc); + + assert(second_place_loc == final_loc); + + q->cur = q->end = 0; + pushQueueAt(q, 0, MQE_START, final_loc); + } else { + DEBUG_PRINTF("queue %u not finished, %u/%u [%lld/%lld]\n", a_qi, q->cur, + q->end, q_cur_loc(q), final_loc); + DEBUG_PRINTF("finding next match location\n"); + + assert(second_place_loc < final_loc); + assert(q_cur_loc(q) >= second_place_loc); + + if (runNewNfaToNextMatch(t, a_qi, q, final_loc, scratch, aa, + report_ok_loc) == HWLM_TERMINATE_MATCHING) { + DEBUG_PRINTF("roseCatchUpNfas done\n"); + return HWLM_TERMINATE_MATCHING; + } + } + + return HWLM_CONTINUE_MATCHING; +} + +void streamInitSufPQ(const struct RoseEngine *t, char *state, + struct hs_scratch *scratch) { + assert(scratch->catchup_pq.qm_size == 0); + assert(t->outfixBeginQueue != t->outfixEndQueue); + + DEBUG_PRINTF("initSufPQ: outfixes [%u,%u)\n", t->outfixBeginQueue, + t->outfixEndQueue); + + u32 qCount = t->queueCount; + u8 *aa = getActiveLeafArray(t, state); + u32 aaCount = t->activeArrayCount; + struct mq *queues = scratch->queues; + size_t length = scratch->core_info.len; + + u32 qi = mmbit_iterate_bounded(aa, aaCount, t->outfixBeginQueue, + t->outfixEndQueue); + for (; qi < t->outfixEndQueue;) { + DEBUG_PRINTF("adding qi=%u\n", qi); + struct mq *q = queues + qi; + + ensureQueueActive(t, qi, qCount, q, scratch); + ensureEnd(q, qi, length); + + char alive = nfaQueueExecToMatch(q->nfa, q, length); + + if (alive == MO_MATCHES_PENDING) { + DEBUG_PRINTF("we have pending matches at %lld\n", q_cur_loc(q)); + s64a qcl = q_cur_loc(q); + + pq_insert_with(&scratch->catchup_pq, scratch, qi, qcl); + } else if (!alive) { + deactivateQueue(t, aa, qi, scratch); + } else { + assert(q->cur == q->end); + /* TODO: can this be simplified? the nfa will never produce any + * matches for this block. 
*/ + DEBUG_PRINTF("queue %u finished, nfa lives\n", qi); + q->cur = q->end = 0; + pushQueueAt(q, 0, MQE_START, length); + } + + qi = mmbit_iterate_bounded(aa, aaCount, qi + 1, t->outfixEndQueue); + } +} + +void blockInitSufPQ(const struct RoseEngine *t, char *state, + struct hs_scratch *scratch, char is_small_block) { + DEBUG_PRINTF("initSufPQ: outfixes [%u,%u)\n", t->outfixBeginQueue, + t->outfixEndQueue); + + assert(scratch->catchup_pq.qm_size == 0); + assert(t->outfixBeginQueue != t->outfixEndQueue); + + struct mq *queues = scratch->queues; + u8 *aa = getActiveLeafArray(t, state); + struct fatbit *aqa = scratch->aqa; + u32 aaCount = t->activeArrayCount; + u32 qCount = t->queueCount; + size_t length = scratch->core_info.len; + + for (u32 qi = t->outfixBeginQueue; qi < t->outfixEndQueue; qi++) { + const struct NfaInfo *info = getNfaInfoByQueue(t, qi); + + if (is_small_block && info->in_sbmatcher) { + DEBUG_PRINTF("skip outfix %u as it's in the SB matcher\n", qi); + continue; + } + + const struct NFA *nfa = getNfaByInfo(t, info); + DEBUG_PRINTF("testing minwidth %u > len %zu\n", nfa->minWidth, + length); + size_t len = nfaRevAccelCheck(nfa, scratch->core_info.buf, length); + if (!len) { + continue; + } + mmbit_set(aa, aaCount, qi); + fatbit_set(aqa, qCount, qi); + struct mq *q = queues + qi; + initQueue(q, qi, t, scratch); + q->length = len; /* adjust for rev_accel */ + nfaQueueInitState(nfa, q); + pushQueueAt(q, 0, MQE_START, 0); + pushQueueAt(q, 1, MQE_TOP, 0); + pushQueueAt(q, 2, MQE_END, length); + + DEBUG_PRINTF("adding qi=%u to pq\n", qi); + + char alive = nfaQueueExecToMatch(q->nfa, q, length); + + if (alive == MO_MATCHES_PENDING) { + DEBUG_PRINTF("we have pending matches at %lld\n", q_cur_loc(q)); + s64a qcl = q_cur_loc(q); + + pq_insert_with(&scratch->catchup_pq, scratch, qi, qcl); + } else if (!alive) { + deactivateQueue(t, aa, qi, scratch); + } else { + assert(q->cur == q->end); + /* TODO: can this be simplified? the nfa will never produce any + * matches for this block. */ + DEBUG_PRINTF("queue %u finished, nfa lives\n", qi); + q->cur = q->end = 0; + pushQueueAt(q, 0, MQE_START, length); + } + } +} + +/** + * safe_loc is ??? + */ +static rose_inline +hwlmcb_rv_t buildSufPQ(const struct RoseEngine *t, char *state, s64a safe_loc, + s64a final_loc, struct hs_scratch *scratch) { + assert(scratch->catchup_pq.qm_size <= t->outfixEndQueue); + + struct RoseContext *tctxt = &scratch->tctxt; + assert(t->activeArrayCount); + + assert(scratch->core_info.buf_offset + final_loc + > tctxt->minNonMpvMatchOffset); + DEBUG_PRINTF("buildSufPQ final loc %lld (safe %lld)\n", final_loc, + safe_loc); + assert(safe_loc <= final_loc); + + u8 *aa = getActiveLeafArray(t, state); + u32 aaCount = t->activeArrayCount; + + /* find first match of each pending nfa */ + DEBUG_PRINTF("aa=%p, aaCount=%u\n", aa, aaCount); + + /* Note: mpv MUST not participate in the main priority queue as + * they may have events pushed on during this process which may be before + * the catch up point. Outfixes are remain in the pq between catchup events + * as they never have any incoming events to worry about. 
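+ *
+ * Suffixes are added to the pq on demand below and are drained again
+ * during catchup, so between catchup events only outfixes should remain
+ * queued; hence the size assert at the top of this function.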
+ */ + if (aaCount == t->outfixEndQueue) { + return HWLM_CONTINUE_MATCHING; + } + + DEBUG_PRINTF("mib %u/%u\n", t->outfixBeginQueue, aaCount); + + u32 a_qi = mmbit_iterate_bounded(aa, aaCount, t->outfixEndQueue, aaCount); + + if (a_qi == MMB_INVALID) { + return HWLM_CONTINUE_MATCHING; + } + + s64a report_ok_loc = tctxt->minNonMpvMatchOffset + 1 + - scratch->core_info.buf_offset; + + hwlmcb_rv_t rv = roseCatchUpMPV(t, report_ok_loc, scratch); + if (rv != HWLM_CONTINUE_MATCHING) { + DEBUG_PRINTF("terminating...\n"); + return rv; + } + + while (a_qi != MMB_INVALID) { + DEBUG_PRINTF("catching up qi=%u to %lld\n", a_qi, final_loc); + u32 n_qi = mmbit_iterate(aa, aaCount, a_qi); + + s64a second_place_loc + = scratch->catchup_pq.qm_size ? pq_top_loc(&scratch->catchup_pq) + : safe_loc; + second_place_loc = MIN(second_place_loc, safe_loc); + if (n_qi == MMB_INVALID && report_ok_loc <= second_place_loc) { + if (buildSufPQ_final(t, report_ok_loc, second_place_loc, final_loc, + scratch, aa, a_qi) + == HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + break; + } + + if (add_to_queue(t, scratch->queues, t->queueCount, aa, scratch, + final_loc, a_qi, report_ok_loc) + == HWLM_TERMINATE_MATCHING) { + DEBUG_PRINTF("roseCatchUpNfas done\n"); + return HWLM_TERMINATE_MATCHING; + } + + a_qi = n_qi; + } + + DEBUG_PRINTF("PQ BUILD %u items\n", scratch->catchup_pq.qm_size); + return HWLM_CONTINUE_MATCHING; +} + +static never_inline +hwlmcb_rv_t roseCatchUpNfas(const struct RoseEngine *t, s64a loc, + s64a final_loc, struct hs_scratch *scratch) { + assert(t->activeArrayCount); + + DEBUG_PRINTF("roseCatchUpNfas offset=%llu + %lld/%lld\n", + scratch->core_info.buf_offset, loc, final_loc); + DEBUG_PRINTF("min non mpv match offset %llu\n", + scratch->tctxt.minNonMpvMatchOffset); + + struct RoseContext *tctxt = &scratch->tctxt; + assert(scratch->core_info.buf_offset + loc >= tctxt->minNonMpvMatchOffset); + + char *state = scratch->core_info.state; + struct mq *queues = scratch->queues; + u8 *aa = getActiveLeafArray(t, state); + + /* fire off earliest nfa match and catchup anchored matches to that point */ + while (scratch->catchup_pq.qm_size) { + s64a match_loc = pq_top_loc(&scratch->catchup_pq); + u32 qi = pq_top(scratch->catchup_pq.qm)->queue; + + DEBUG_PRINTF("winrar q%u@%lld loc %lld\n", qi, match_loc, loc); + assert(match_loc + scratch->core_info.buf_offset + >= scratch->tctxt.minNonMpvMatchOffset); + + if (match_loc > loc) { + /* we have processed all the matches at or before rose's current + * location; only things remaining on the pq should be outfixes. */ + DEBUG_PRINTF("saving for later\n"); + goto exit; + } + + /* catch up char matches to this point */ + if (roseCatchUpMPV(t, match_loc, scratch) + == HWLM_TERMINATE_MATCHING) { + DEBUG_PRINTF("roseCatchUpNfas done\n"); + return HWLM_TERMINATE_MATCHING; + } + + assert(match_loc + scratch->core_info.buf_offset + >= scratch->tctxt.minNonMpvMatchOffset); + + struct mq *q = queues + qi; + + /* outfixes must be advanced all the way as they persist in the pq + * between catchup events */ + s64a q_final_loc = qi >= t->outfixEndQueue ? final_loc + : (s64a)scratch->core_info.len; + + /* fire nfa matches, and find next place this nfa match */ + DEBUG_PRINTF("reporting matches %u@%llu [q->cur %u/%u]\n", qi, + match_loc, q->cur, q->end); + + /* we then need to catch this nfa up to next earliest nfa match. These + * matches can be fired directly from the callback. 
The callback needs + * to ensure that the anchored matches remain in sync though */ + s64a second_place_loc = findSecondPlace(&scratch->catchup_pq, loc); + DEBUG_PRINTF("second place %lld loc %lld\n", second_place_loc, loc); + + if (second_place_loc == q_cur_loc(q)) { + if (runExistingNfaToNextMatch(t, qi, q, q_final_loc, scratch, aa, 1) + == HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + continue; + } + + char alive = blast_queue(scratch, q, qi, second_place_loc, 1); + + if (!alive) { + if (can_stop_matching(scratch)) { + DEBUG_PRINTF("roseCatchUpNfas done as bailing\n"); + return HWLM_TERMINATE_MATCHING; + } + + deactivateQueue(t, aa, qi, scratch); + pq_pop_nice(&scratch->catchup_pq); + } else if (q->cur == q->end) { + DEBUG_PRINTF("queue %u finished, nfa lives [%lld]\n", qi, loc); + q->cur = q->end = 0; + pushQueueAt(q, 0, MQE_START, loc); + pq_pop_nice(&scratch->catchup_pq); + } else if (second_place_loc == q_final_loc) { + DEBUG_PRINTF("queue %u on hold\n", qi); + pq_pop_nice(&scratch->catchup_pq); + break; + } else { + DEBUG_PRINTF("queue %u not finished, %u/%u [%lld/%lld]\n", + qi, q->cur, q->end, q->items[q->cur].location, loc); + runExistingNfaToNextMatch(t, qi, q, q_final_loc, scratch, aa, 0); + } + } +exit:; + tctxt->minNonMpvMatchOffset = scratch->core_info.buf_offset + loc; + DEBUG_PRINTF("roseCatchUpNfas done\n"); + return HWLM_CONTINUE_MATCHING; +} + +hwlmcb_rv_t roseCatchUpAll(s64a loc, struct hs_scratch *scratch) { + /* just need suf/outfixes and mpv */ + DEBUG_PRINTF("loc %lld mnmmo %llu mmo %llu\n", loc, + scratch->tctxt.minNonMpvMatchOffset, + scratch->tctxt.minMatchOffset); + assert(scratch->core_info.buf_offset + loc + > scratch->tctxt.minNonMpvMatchOffset); + + const struct RoseEngine *t = scratch->core_info.rose; + char *state = scratch->core_info.state; + + hwlmcb_rv_t rv = buildSufPQ(t, state, loc, loc, scratch); + if (rv != HWLM_CONTINUE_MATCHING) { + return rv; + } + + rv = roseCatchUpNfas(t, loc, loc, scratch); + if (rv != HWLM_CONTINUE_MATCHING) { + return rv; + } + + rv = roseCatchUpMPV(t, loc, scratch); + assert(rv != HWLM_CONTINUE_MATCHING + || scratch->catchup_pq.qm_size <= t->outfixEndQueue); + assert(!can_stop_matching(scratch) || rv == HWLM_TERMINATE_MATCHING); + return rv; +} + +hwlmcb_rv_t roseCatchUpSuf(s64a loc, struct hs_scratch *scratch) { + /* just need suf/outfixes. mpv will be caught up only to last reported + * external match */ + assert(scratch->core_info.buf_offset + loc + > scratch->tctxt.minNonMpvMatchOffset); + + const struct RoseEngine *t = scratch->core_info.rose; + char *state = scratch->core_info.state; + + hwlmcb_rv_t rv = buildSufPQ(t, state, loc, loc, scratch); + if (rv != HWLM_CONTINUE_MATCHING) { + return rv; + } + + rv = roseCatchUpNfas(t, loc, loc, scratch); + assert(rv != HWLM_CONTINUE_MATCHING || + scratch->catchup_pq.qm_size <= t->outfixEndQueue); + + return rv; +} diff --git a/regex/rose/catchup.h b/regex/rose/catchup.h new file mode 100644 index 000000000..8188d5af0 --- /dev/null +++ b/regex/rose/catchup.h @@ -0,0 +1,207 @@ +/* + * Copyright (c) 2015-2018, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** + * \file + * \brief Rose runtime: code for catching up output-exposed engines. + * + * Rose has several components which run behind the main (floating table) clock + * and need to be caught up before we report matches. + * + * Currently we have to deal with: + * 1. Suffix/Outfix NFAs + * 2. A single MPV NFA (chained), which may also be triggered by (1). + * + * The approach is to: + * - (A) build a priority queue of the suffix/outfixes based on their first + * match location; + * - (B) process the matches from the priority queue in order; + * - (C) As we report matches from (B) we interleave matches from the MPV if it + * exists. + */ + +#ifndef ROSE_CATCHUP_H +#define ROSE_CATCHUP_H + +#include "hwlm/hwlm.h" +#include "runtime.h" +#include "scratch.h" +#include "rose.h" +#include "rose_common.h" +#include "rose_internal.h" +#include "ue2common.h" +#include "util/multibit.h" + +hwlmcb_rv_t roseCatchUpAll(s64a loc, struct hs_scratch *scratch); + +/* will only catch mpv up to last reported external match */ +hwlmcb_rv_t roseCatchUpSuf(s64a loc, struct hs_scratch *scratch); + +hwlmcb_rv_t roseCatchUpMPV_i(const struct RoseEngine *t, s64a loc, + struct hs_scratch *scratch); + +void blockInitSufPQ(const struct RoseEngine *t, char *state, + struct hs_scratch *scratch, char is_small_block); +void streamInitSufPQ(const struct RoseEngine *t, char *state, + struct hs_scratch *scratch); + +static really_inline +int canSkipCatchUpMPV(const struct RoseEngine *t, struct hs_scratch *scratch, + u64a cur_offset) { + if (!has_chained_nfas(t)) { + return 1; + } + + /* note: we may have to run at less than tctxt.minMatchOffset as we may + * have a full queue of postponed events that we need to flush */ + if (cur_offset < scratch->tctxt.next_mpv_offset) { + DEBUG_PRINTF("skipping cur_offset %llu min %llu, mpv %llu\n", + cur_offset, scratch->tctxt.minMatchOffset, + scratch->tctxt.next_mpv_offset); + return 1; + } + + assert(t->activeArrayCount); + + DEBUG_PRINTF("cur offset offset: %llu\n", cur_offset); + DEBUG_PRINTF("min match offset %llu\n", scratch->tctxt.minMatchOffset); + + assert(t->outfixBeginQueue == 1); /* if it exists mpv is queue 0 */ + + const u8 *aa = getActiveLeafArray(t, scratch->core_info.state); + return !mmbit_isset(aa, t->activeArrayCount, 0); +} + +/** \brief Catches up the MPV. 
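+ *
+ * If catchup can be skipped (no chained engines, or the mpv has no work to
+ * do before this offset), only the flush combination program is run and the
+ * minimum match offset is updated; otherwise roseCatchUpMPV_i() is called
+ * to actually run the mpv.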
*/ +static really_inline +hwlmcb_rv_t roseCatchUpMPV(const struct RoseEngine *t, s64a loc, + struct hs_scratch *scratch) { + u64a cur_offset = loc + scratch->core_info.buf_offset; + assert(cur_offset >= scratch->tctxt.minMatchOffset); + assert(!can_stop_matching(scratch)); + + if (canSkipCatchUpMPV(t, scratch, cur_offset)) { + if (t->flushCombProgramOffset) { + if (roseRunFlushCombProgram(t, scratch, cur_offset) + == HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + } + updateMinMatchOffsetFromMpv(&scratch->tctxt, cur_offset); + return HWLM_CONTINUE_MATCHING; + } + + /* Note: chained tails MUST not participate in the priority queue as + * they may have events pushed on during this process which may be before + * the catch up point */ + + return roseCatchUpMPV_i(t, loc, scratch); +} + +/** \brief Catches up NFAs and the MPV. */ +static rose_inline +hwlmcb_rv_t roseCatchUpTo(const struct RoseEngine *t, + struct hs_scratch *scratch, u64a end) { + /* no need to catch up if we are at the same offset as last time */ + if (end <= scratch->tctxt.minMatchOffset) { + /* we must already be up to date */ + DEBUG_PRINTF("skip\n"); + return HWLM_CONTINUE_MATCHING; + } + + char *state = scratch->core_info.state; + s64a loc = end - scratch->core_info.buf_offset; + + if (end <= scratch->tctxt.minNonMpvMatchOffset) { + /* only need to catch up the mpv */ + return roseCatchUpMPV(t, loc, scratch); + } + + assert(scratch->tctxt.minMatchOffset >= scratch->core_info.buf_offset); + hwlmcb_rv_t rv; + if (!t->activeArrayCount + || !mmbit_any(getActiveLeafArray(t, state), t->activeArrayCount)) { + if (t->flushCombProgramOffset) { + if (roseRunFlushCombProgram(t, scratch, end) + == HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + } + updateMinMatchOffset(&scratch->tctxt, end); + rv = HWLM_CONTINUE_MATCHING; + } else { + rv = roseCatchUpAll(loc, scratch); + } + + assert(rv != HWLM_CONTINUE_MATCHING + || scratch->tctxt.minMatchOffset == end); + assert(rv != HWLM_CONTINUE_MATCHING + || scratch->tctxt.minNonMpvMatchOffset == end); + assert(!can_stop_matching(scratch) || rv == HWLM_TERMINATE_MATCHING); + return rv; +} + +/** + * \brief Catches up anything which may add triggers on the MPV (suffixes and + * outfixes). + * + * The MPV will be run only to intersperse matches in the output match stream + * if external matches are raised. + */ +static rose_inline +hwlmcb_rv_t roseCatchUpMpvFeeders(const struct RoseEngine *t, + struct hs_scratch *scratch, u64a end) { + /* no need to catch up if we are at the same offset as last time */ + if (end <= scratch->tctxt.minNonMpvMatchOffset) { + /* we must already be up to date */ + DEBUG_PRINTF("skip\n"); + return HWLM_CONTINUE_MATCHING; + } + + s64a loc = end - scratch->core_info.buf_offset; + + assert(t->activeArrayCount); /* mpv is in active array */ + assert(scratch->tctxt.minMatchOffset >= scratch->core_info.buf_offset); + + if (!t->mpvTriggeredByLeaf) { + /* no need to check as they never put triggers onto the mpv */ + return HWLM_CONTINUE_MATCHING; + } + + /* sadly, this branch rarely gets taken as the mpv itself is usually + * alive. 
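+ * If nothing at all is active, we simply record that the non-mpv engines
+ * are caught up to this offset and return.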
*/ + char *state = scratch->core_info.state; + if (!mmbit_any(getActiveLeafArray(t, state), t->activeArrayCount)) { + scratch->tctxt.minNonMpvMatchOffset = end; + return HWLM_CONTINUE_MATCHING; + } + + return roseCatchUpSuf(loc, scratch); +} + +#endif diff --git a/regex/rose/counting_miracle.h b/regex/rose/counting_miracle.h new file mode 100644 index 000000000..976208b73 --- /dev/null +++ b/regex/rose/counting_miracle.h @@ -0,0 +1,263 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef ROSE_COUNTING_MIRACLE_H +#define ROSE_COUNTING_MIRACLE_H + +#include "ue2common.h" +#include "runtime.h" +#include "rose_internal.h" +#include "nfa/nfa_api_queue.h" +#include "util/simd_utils.h" + +/** \brief Maximum number of bytes to scan when looking for a "counting miracle" + * stop character. 
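+ *
+ * Bounding the scan keeps the check cheap: if the required count is not
+ * reached within this window, roseCountingMiracleOccurs() returns 0 and no
+ * bytes are skipped.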
*/ +#define COUNTING_MIRACLE_LEN_MAX 256 + +static really_inline +char roseCountingMiracleScan(u8 c, const u8 *d, const u8 *d_end, + u32 target_count, u32 *count_inout, + const u8 **d_out) { + assert(d <= d_end); + + u32 count = *count_inout; + + m128 chars = set16x8(c); + + for (; d + 16 <= d_end; d_end -= 16) { + m128 data = loadu128(d_end - 16); + u32 z1 = movemask128(eq128(chars, data)); + count += popcount32(z1); + + if (count >= target_count) { + *d_out = d_end - 16; + *count_inout = count; + return 1; + } + } + + if (d != d_end) { + char temp[sizeof(m128)]; + assert(d + sizeof(temp) > d_end); + memset(temp, c + 1, sizeof(temp)); + memcpy(temp, d, d_end - d); + m128 data = loadu128(temp); + u32 z1 = movemask128(eq128(chars, data)); + count += popcount32(z1); + + if (count >= target_count) { + *d_out = d; + *count_inout = count; + return 1; + } + } + + *count_inout = count; + return 0; +} + +#define GET_LO_4(chars) and128(chars, low4bits) +#define GET_HI_4(chars) rshift64_m128(andnot128(low4bits, chars), 4) + +static really_inline +u32 roseCountingMiracleScanShufti(m128 mask_lo, m128 mask_hi, u8 poison, + const u8 *d, const u8 *d_end, + u32 target_count, u32 *count_inout, + const u8 **d_out) { + assert(d <= d_end); + + u32 count = *count_inout; + + const m128 zeroes = zeroes128(); + const m128 low4bits = _mm_set1_epi8(0xf); + + for (; d + 16 <= d_end; d_end -= 16) { + m128 data = loadu128(d_end - 16); + m128 c_lo = pshufb_m128(mask_lo, GET_LO_4(data)); + m128 c_hi = pshufb_m128(mask_hi, GET_HI_4(data)); + m128 t = and128(c_lo, c_hi); + u32 z1 = movemask128(eq128(t, zeroes)); + count += popcount32(z1 ^ 0xffff); + + if (count >= target_count) { + *d_out = d_end - 16; + *count_inout = count; + return 1; + } + } + + if (d != d_end) { + char temp[sizeof(m128)]; + assert(d + sizeof(temp) > d_end); + memset(temp, poison, sizeof(temp)); + memcpy(temp, d, d_end - d); + m128 data = loadu128(temp); + m128 c_lo = pshufb_m128(mask_lo, GET_LO_4(data)); + m128 c_hi = pshufb_m128(mask_hi, GET_HI_4(data)); + m128 t = and128(c_lo, c_hi); + u32 z1 = movemask128(eq128(t, zeroes)); + count += popcount32(z1 ^ 0xffff); + + if (count >= target_count) { + *d_out = d; + *count_inout = count; + return 1; + } + } + + *count_inout = count; + return 0; +} + +/** + * \brief "Counting Miracle" scan: If we see more than N instances of a + * particular character class we know that the engine must be dead. + * + * Scans the buffer/history between relative locations \a begin_loc and \a + * end_loc, and returns a miracle location (if any) that appears in the stream + * after \a begin_loc. + * + * Returns 1 if some bytes can be skipped and sets \a miracle_loc + * appropriately, 0 otherwise. 
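+ *
+ * Two scan routines back this check: roseCountingMiracleScan() counts
+ * occurrences of a single literal byte (partial SIMD blocks are padded with
+ * c + 1 so the padding can never match), while
+ * roseCountingMiracleScanShufti() counts members of a character class using
+ * nibble shuffle masks (partial blocks are padded with the class's poison
+ * byte). Both scan backwards from the lag-adjusted end of the region.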
+ */ +static never_inline +int roseCountingMiracleOccurs(const struct RoseEngine *t, + const struct LeftNfaInfo *left, + const struct core_info *ci, s64a begin_loc, + const s64a end_loc, s64a *miracle_loc) { + if (!left->countingMiracleOffset) { + return 0; + } + + const struct RoseCountingMiracle *cm + = (const void *)((const char *)t + left->countingMiracleOffset); + + assert(!left->transient); + assert(cm->count > 1); /* should be a normal miracle then */ + + DEBUG_PRINTF("looking for counting miracle over [%lld,%lld], maxLag=%u\n", + begin_loc, end_loc, left->maxLag); + DEBUG_PRINTF("ci->len=%zu, ci->hlen=%zu\n", ci->len, ci->hlen); + + assert(begin_loc <= end_loc); + assert(begin_loc >= -(s64a)ci->hlen); + assert(end_loc <= (s64a)ci->len); + + const s64a scan_end_loc = end_loc - left->maxLag; + if (scan_end_loc <= begin_loc) { + DEBUG_PRINTF("nothing to scan\n"); + return 0; + } + + const s64a start = MAX(begin_loc, scan_end_loc - COUNTING_MIRACLE_LEN_MAX); + DEBUG_PRINTF("scan [%lld..%lld]\n", start, scan_end_loc); + + u32 count = 0; + + s64a m_loc = start; + + if (!cm->shufti) { + u8 c = cm->c; + + // Scan buffer. + const s64a buf_scan_start = MAX(0, start); + if (scan_end_loc > buf_scan_start) { + const u8 *buf = ci->buf; + const u8 *d = buf + scan_end_loc; + const u8 *d_start = buf + buf_scan_start; + const u8 *d_out; + if (roseCountingMiracleScan(c, d_start, d, cm->count, &count, + &d_out)) { + assert(d_out >= d_start); + m_loc = (d_out - d_start) + buf_scan_start; + goto success; + } + } + + // Scan history. + if (start < 0) { + const u8 *hbuf_end = ci->hbuf + ci->hlen; + const u8 *d = hbuf_end + MIN(0, scan_end_loc); + const u8 *d_start = hbuf_end + start; + const u8 *d_out; + if (roseCountingMiracleScan(c, d_start, d, cm->count, &count, + &d_out)) { + assert(d_out >= d_start); + m_loc = (d_out - d_start) + start; + goto success; + } + } + } else { + m128 lo = cm->lo; + m128 hi = cm->hi; + u8 poison = cm->poison; + + // Scan buffer. + const s64a buf_scan_start = MAX(0, start); + if (scan_end_loc > buf_scan_start) { + const u8 *buf = ci->buf; + const u8 *d = buf + scan_end_loc; + const u8 *d_start = buf + buf_scan_start; + const u8 *d_out; + if (roseCountingMiracleScanShufti(lo, hi, poison, d_start, d, + cm->count, &count, &d_out)) { + assert(d_out >= d_start); + m_loc = (d_out - d_start) + buf_scan_start; + goto success; + } + } + + // Scan history. + if (start < 0) { + const u8 *hbuf_end = ci->hbuf + ci->hlen; + const u8 *d = hbuf_end + MIN(0, scan_end_loc); + const u8 *d_start = hbuf_end + start; + const u8 *d_out; + if (roseCountingMiracleScanShufti(lo, hi, poison, d_start, d, + cm->count, &count, &d_out)) { + assert(d_out >= d_start); + m_loc = (d_out - d_start) + start; + goto success; + } + } + } + + DEBUG_PRINTF("found %u/%u\n", count, cm->count); + return 0; + +success: + DEBUG_PRINTF("found %u/%u\n", count, cm->count); + assert(count >= cm->count); + assert(m_loc < scan_end_loc); + assert(m_loc >= start); + + *miracle_loc = m_loc; + return 1; +} + +#endif diff --git a/regex/rose/infix.h b/regex/rose/infix.h new file mode 100644 index 000000000..9cf9c0ad7 --- /dev/null +++ b/regex/rose/infix.h @@ -0,0 +1,161 @@ +/* + * Copyright (c) 2015-2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef INFIX_H +#define INFIX_H + +#include "ue2common.h" +#include "nfa/nfa_api.h" +#include "nfa/nfa_api_queue.h" +#include "nfa/nfa_internal.h" + +static really_inline +int infixTooOld(struct mq *q, s64a curr_loc) { + u32 maxAge = q->nfa->maxWidth; + + if (!maxAge) { + return 0; + } + + return q_last_loc(q) + maxAge < curr_loc; +} + +static really_inline +int canReduceQueue(const struct mq *q, s64a curr_loc, u32 maxTops, u32 maxAge) { + u32 qlen = q->end - q->cur; /* includes MQE_START */ + + if (maxAge && q->items[q->cur].location + maxAge < curr_loc) { + return 1; + } + + if (qlen - 1 > maxTops) { + return 1; + } + + if (qlen - 1 == maxTops + && q->items[q->cur].location != q->items[q->cur + 1].location) { + /* we can advance start to the first top location */ + return 1; + } + + return 0; +} + +/** + * Removes tops which are known not to affect the final state from the queue. + * May also reinitialise the engine state if it is unneeded. + * + * maxAge is the maximum width of the infix. Any tops/state before this can be + * ignored. 0 is used to indicate that there is no upper bound on the width of + * the pattern. + * + * maxTops is the maximum number of locations of tops that can affect the top. + * It is only possible for the last maxTops tops to affect the final state - + * earlier ones can be safely removed. Also, any state before the max tops may + * be ignored. + * + * This code assumes/requires that there are not multiple tops at the same + * location in the queue. This code also assumes that it is not a multitop + * engine. + */ +static really_inline +void reduceInfixQueue(struct mq *q, s64a curr_loc, u32 maxTops, u32 maxAge) { + assert(q->end > q->cur); + assert(maxTops); + u32 qlen = q->end - q->cur; /* includes MQE_START */ + DEBUG_PRINTF("q=%p, len=%u, maxTops=%u maxAge=%u\n", q, qlen, maxTops, + maxAge); + + if (!canReduceQueue(q, curr_loc, maxTops, maxAge)) { + DEBUG_PRINTF("nothing to do\n"); + return; + } + +#ifdef DEBUG + debugQueue(q); +#endif + + char drop_state = qlen - 1 >= maxTops + || (maxAge && q->items[q->cur].location + maxAge < curr_loc); + + LIMIT_TO_AT_MOST(&maxTops, qlen - 1); + + // We leave our START where it is, at the front of the queue. 
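+ // (drop_state, computed above, records whether the retained tops fully
+ // determine the engine state; if it is set, the state is reinitialised at
+ // the bottom of this function and the START event is advanced to the
+ // first retained top, or to the current location if no tops remain.)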
+ assert(q->items[q->cur].type == MQE_START); + + // We want to shuffle maxQueueLen items from the end of the queue to just + // after the start, effectively dequeuing old items. We could use memmove + // for this, but it's probably not a good idea to take the cost of the + // function call. + const struct mq_item *src = &q->items[q->cur + qlen - maxTops]; + + q->items[0] = q->items[q->cur]; /* shift start event to 0 slot */ + q->cur = 0; + q->end = 1; + struct mq_item *dst = &q->items[1]; + u32 i = 0; + if (maxAge) { + /* any event which is older than maxAge can be dropped */ + for (; i < maxTops; i++, src++) { + if (src->location >= curr_loc - maxAge) { + break; + } + } + } + + for (; i < maxTops; i++) { + *dst = *src; + src++; + dst++; + q->end++; + } + + if (drop_state) { + /* clear state and shift start up to first top */ + s64a new_loc; + if (q->end > 1) { + new_loc = q->items[1].location; + } else { + DEBUG_PRINTF("no tops\n"); + new_loc = curr_loc; + } + + DEBUG_PRINTF("advancing start from %lld to %lld\n", + q->items[0].location, new_loc); + assert(new_loc > q->items[0].location); + q->items[0].location = new_loc; + nfaQueueInitState(q->nfa, q); + } + + DEBUG_PRINTF("reduced queue to len=%u\n", q->end - q->cur); +#ifdef DEBUG + debugQueue(q); +#endif +} + +#endif diff --git a/regex/rose/init.c b/regex/rose/init.c new file mode 100644 index 000000000..761024d1a --- /dev/null +++ b/regex/rose/init.c @@ -0,0 +1,92 @@ +/* + * Copyright (c) 2015-2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "init.h" +#include "match.h" +#include "runtime.h" +#include "scratch.h" +#include "rose.h" +#include "rose_common.h" +#include "rose_internal.h" +#include "ue2common.h" +#include "nfa/mcclellan.h" +#include "nfa/nfa_api_util.h" +#include "nfa/nfa_internal.h" +#include "util/multibit.h" + +#ifndef __KERNEL__ +#include +#else +#include +#endif + +static really_inline +void init_rstate(const struct RoseEngine *t, char *state) { + // Set runtime state: we take our initial groups from the RoseEngine. 
+ DEBUG_PRINTF("setting initial groups to 0x%016llx\n", t->initialGroups); + storeGroups(t, state, t->initialGroups); +} + +static really_inline +void init_outfixes(const struct RoseEngine *t, char *state) { + /* The active leaf array has been init'ed by the scatter with outfix + * bits set on */ + + // Init the NFA state for each outfix. + for (u32 qi = t->outfixBeginQueue; qi < t->outfixEndQueue; qi++) { + const struct NfaInfo *info = getNfaInfoByQueue(t, qi); + const struct NFA *nfa = getNfaByInfo(t, info); + nfaInitCompressedState(nfa, 0, state + info->stateOffset, + 0 /* assume NUL at start */); + } + + if (t->initMpvNfa != MO_INVALID_IDX) { + const struct NfaInfo *info = getNfaInfoByQueue(t, t->initMpvNfa); + const struct NFA *nfa = getNfaByInfo(t, info); + nfaInitCompressedState(nfa, 0, state + info->stateOffset, + 0 /* assume NUL at start */); + mmbit_set(getActiveLeafArray(t, state), t->activeArrayCount, + t->initMpvNfa); + } +} + +void roseInitState(const struct RoseEngine *t, char *state) { + assert(t); + assert(state); + + DEBUG_PRINTF("init for Rose %p with %u state indices)\n", t, + t->rolesWithStateCount); + + // Rose is guaranteed 8-aligned state + assert(ISALIGNED_N(state, 8)); + + init_rstate(t, state); + + init_state(t, state); + init_outfixes(t, state); +} diff --git a/regex/rose/init.h b/regex/rose/init.h new file mode 100644 index 000000000..b37053b26 --- /dev/null +++ b/regex/rose/init.h @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2015-2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef ROSE_INIT_H +#define ROSE_INIT_H + +#include "rose_internal.h" +#include "ue2common.h" +#include "util/scatter_runtime.h" + +/* + * Initialisation code common to streaming mode Rose (init.c) and block mode + * Rose (block.c) code. 
+ */ + +static really_inline +void init_state(const struct RoseEngine *t, char *state) { + scatter(state, t, &t->state_init); +} + +#endif // ROSE_INIT_H diff --git a/regex/rose/match.c b/regex/rose/match.c new file mode 100644 index 000000000..023db3860 --- /dev/null +++ b/regex/rose/match.c @@ -0,0 +1,632 @@ +/* + * Copyright (c) 2015-2019, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "catchup.h" +#include "match.h" +#include "program_runtime.h" +#include "rose.h" +#include "util/bitutils.h" +#include "util/fatbit.h" + +#if defined(DEBUG) || defined(DUMP_SUPPORT) +#include "util/compare.h" +/** A debugging crutch: print a hex-escaped version of the match for our + * perusal. The start and end offsets are stream offsets. */ +static UNUSED +void printMatch(const struct core_info *ci, u64a start, u64a end) { + assert(start <= end); + assert(end <= ci->buf_offset + ci->len); + + DEBUG_PRINTF("'"); + u64a i = start; + for (; i <= MIN(ci->buf_offset, end); i++) { + u64a h_idx = ci->buf_offset - i; + u8 c = h_idx >= ci->hlen ? '?' : ci->hbuf[ci->hlen - h_idx - 1]; + if (ourisprint(c) && c != '\'') { + DEBUG_PRINTF("%c", c); + } else { + DEBUG_PRINTF("\\x%02x", c); + } + } + for (; i <= end; i++) { + u64a b_idx = i - ci->buf_offset - 1; + u8 c = b_idx >= ci->len ? '?' : ci->buf[b_idx]; + if (ourisprint(c) && c != '\'') { + DEBUG_PRINTF("%c", c); + } else { + DEBUG_PRINTF("\\x%02x", c); + } + } + DEBUG_PRINTF("'"); +} +#endif + +hwlmcb_rv_t roseDelayRebuildCallback(size_t end, u32 id, + struct hs_scratch *scratch) { + struct RoseContext *tctx = &scratch->tctxt; + struct core_info *ci = &scratch->core_info; + const struct RoseEngine *t = ci->rose; + size_t rb_len = MIN(ci->hlen, t->delayRebuildLength); + + u64a real_end = ci->buf_offset - rb_len + end + 1; // index after last byte + +#ifdef DEBUG + DEBUG_PRINTF("REBUILD MATCH id=%u end offset@%llu]: ", id, real_end); + u64a start = real_end < 8 ? 
1 : real_end - 7; + printMatch(ci, start, real_end); + DEBUG_PRINTF("\n"); +#endif + + DEBUG_PRINTF("STATE groups=0x%016llx\n", tctx->groups); + + assert(id && id < t->size); // id is a program offset + const u64a som = 0; + const u8 flags = 0; + UNUSED hwlmcb_rv_t rv = + roseRunProgram(t, scratch, id, som, real_end, flags); + assert(rv != HWLM_TERMINATE_MATCHING); + + /* we are just repopulating the delay queue, groups should be + * already set from the original scan. */ + + return tctx->groups; +} + +static really_inline +hwlmcb_rv_t ensureMpvQueueFlushed(const struct RoseEngine *t, + struct hs_scratch *scratch, u32 qi, s64a loc, + char in_chained) { + return ensureQueueFlushed_i(t, scratch, qi, loc, 1, in_chained); +} + +hwlmcb_rv_t roseHandleChainMatch(const struct RoseEngine *t, + struct hs_scratch *scratch, u32 event, + u64a top_squash_distance, u64a end, + char in_catchup) { + assert(event == MQE_TOP || event >= MQE_TOP_FIRST); + struct core_info *ci = &scratch->core_info; + + u8 *aa = getActiveLeafArray(t, scratch->core_info.state); + u32 aaCount = t->activeArrayCount; + struct fatbit *activeQueues = scratch->aqa; + u32 qCount = t->queueCount; + + const u32 qi = 0; /* MPV is always queue 0 if it exists */ + struct mq *q = &scratch->queues[qi]; + const struct NfaInfo *info = getNfaInfoByQueue(t, qi); + + s64a loc = (s64a)end - ci->buf_offset; + assert(loc <= (s64a)ci->len && loc >= -(s64a)ci->hlen); + + if (!mmbit_set(aa, aaCount, qi)) { + initQueue(q, qi, t, scratch); + nfaQueueInitState(q->nfa, q); + pushQueueAt(q, 0, MQE_START, loc); + fatbit_set(activeQueues, qCount, qi); + } else if (info->no_retrigger) { + DEBUG_PRINTF("yawn\n"); + /* nfa only needs one top; we can go home now */ + return HWLM_CONTINUE_MATCHING; + } else if (!fatbit_set(activeQueues, qCount, qi)) { + initQueue(q, qi, t, scratch); + loadStreamState(q->nfa, q, 0); + pushQueueAt(q, 0, MQE_START, 0); + } else if (isQueueFull(q)) { + DEBUG_PRINTF("queue %u full -> catching up nfas\n", qi); + /* we know it is a chained nfa and the suffixes/outfixes must already + * be known to be consistent */ + if (ensureMpvQueueFlushed(t, scratch, qi, loc, in_catchup) + == HWLM_TERMINATE_MATCHING) { + DEBUG_PRINTF("terminating...\n"); + return HWLM_TERMINATE_MATCHING; + } + } + + if (top_squash_distance) { + assert(q->cur < q->end); + struct mq_item *last = &q->items[q->end - 1]; + if (last->type == event + && last->location >= loc - (s64a)top_squash_distance) { + last->location = loc; + goto event_enqueued; + } + } + + pushQueue(q, event, loc); + +event_enqueued: + if (q_cur_loc(q) == (s64a)ci->len) { + /* we may not run the nfa; need to ensure state is fine */ + DEBUG_PRINTF("empty run\n"); + pushQueueNoMerge(q, MQE_END, loc); + char alive = nfaQueueExec(q->nfa, q, loc); + if (alive) { + scratch->tctxt.mpv_inactive = 0; + q->cur = q->end = 0; + pushQueueAt(q, 0, MQE_START, loc); + } else { + mmbit_unset(aa, aaCount, qi); + fatbit_unset(scratch->aqa, qCount, qi); + } + } + + DEBUG_PRINTF("added mpv event at %lld\n", loc); + scratch->tctxt.next_mpv_offset = 0; /* the top event may result in matches + * earlier than expected */ + return HWLM_CONTINUE_MATCHING; +} + +int roseAnchoredCallback(u64a start, u64a end, u32 id, void *ctx) { + struct hs_scratch *scratch = ctx; + assert(scratch && scratch->magic == SCRATCH_MAGIC); + struct RoseContext *tctxt = &scratch->tctxt; + struct core_info *ci = &scratch->core_info; + const struct RoseEngine *t = ci->rose; + + u64a real_end = ci->buf_offset + end; // index after last byte + + 
DEBUG_PRINTF("MATCH id=%u offsets=[???,%llu]\n", id, real_end); + DEBUG_PRINTF("STATE groups=0x%016llx\n", tctxt->groups); + + if (can_stop_matching(scratch)) { + DEBUG_PRINTF("received a match when we're already dead!\n"); + return MO_HALT_MATCHING; + } + + /* delayed literals need to be delivered before real literals; however + * delayed literals only come from the floating table so if we are going + * to deliver a literal here it must be too early for a delayed literal */ + + /* no history checks from anchored region and we are before the flush + * boundary */ + + if (real_end <= t->floatingMinLiteralMatchOffset) { + roseFlushLastByteHistory(t, scratch, real_end); + tctxt->lastEndOffset = real_end; + } + + // Note that the "id" we have been handed is the program offset. + const u8 flags = ROSE_PROG_FLAG_IN_ANCHORED; + if (roseRunProgram(t, scratch, id, start, real_end, flags) + == HWLM_TERMINATE_MATCHING) { + assert(can_stop_matching(scratch)); + DEBUG_PRINTF("caller requested termination\n"); + return MO_HALT_MATCHING; + } + + DEBUG_PRINTF("DONE groups=0x%016llx\n", tctxt->groups); + + return MO_CONTINUE_MATCHING; +} + +/** + * \brief Run the program for the given literal ID, with the interpreter + * inlined into this call. + * + * Assumes not in_anchored. + */ +static really_inline +hwlmcb_rv_t roseProcessMatchInline(const struct RoseEngine *t, + struct hs_scratch *scratch, u64a end, + u32 id) { + DEBUG_PRINTF("id=%u\n", id); + const u64a som = 0; + const u8 flags = 0; + if (t->pureLiteral) { + return roseRunProgram_l(t, scratch, id, som, end, flags); + } else { + return roseRunProgram(t, scratch, id, som, end, flags); + } +} + +static rose_inline +hwlmcb_rv_t playDelaySlot(const struct RoseEngine *t, + struct hs_scratch *scratch, + struct fatbit **delaySlots, u32 vicIndex, + u64a offset) { + /* assert(!tctxt->in_anchored); */ + assert(vicIndex < DELAY_SLOT_COUNT); + const struct fatbit *vicSlot = delaySlots[vicIndex]; + u32 delay_count = t->delay_count; + + if (offset < t->floatingMinLiteralMatchOffset) { + DEBUG_PRINTF("too soon\n"); + return HWLM_CONTINUE_MATCHING; + } + + struct RoseContext *tctxt = &scratch->tctxt; + roseFlushLastByteHistory(t, scratch, offset); + tctxt->lastEndOffset = offset; + + const u32 *programs = getByOffset(t, t->delayProgramOffset); + + for (u32 it = fatbit_iterate(vicSlot, delay_count, MMB_INVALID); + it != MMB_INVALID; it = fatbit_iterate(vicSlot, delay_count, it)) { + UNUSED rose_group old_groups = tctxt->groups; + + DEBUG_PRINTF("DELAYED MATCH id=%u offset=%llu\n", it, offset); + const u64a som = 0; + const u8 flags = 0; + hwlmcb_rv_t rv = roseRunProgram(t, scratch, programs[it], som, offset, + flags); + DEBUG_PRINTF("DONE groups=0x%016llx\n", tctxt->groups); + + /* delayed literals can't safely set groups. 
+ * However we may be setting groups that successors already have + * worked out that we don't need to match the group */ + DEBUG_PRINTF("groups in %016llx out %016llx\n", old_groups, + tctxt->groups); + + if (rv == HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + } + + return HWLM_CONTINUE_MATCHING; +} + +static really_inline +hwlmcb_rv_t flushAnchoredLiteralAtLoc(const struct RoseEngine *t, + struct hs_scratch *scratch, + u32 curr_loc) { + struct RoseContext *tctxt = &scratch->tctxt; + struct fatbit *curr_row = getAnchoredLiteralLog(scratch)[curr_loc - 1]; + u32 region_width = t->anchored_count; + + const u32 *programs = getByOffset(t, t->anchoredProgramOffset); + + DEBUG_PRINTF("report matches at curr loc\n"); + for (u32 it = fatbit_iterate(curr_row, region_width, MMB_INVALID); + it != MMB_INVALID; it = fatbit_iterate(curr_row, region_width, it)) { + DEBUG_PRINTF("it = %u/%u\n", it, region_width); + + rose_group old_groups = tctxt->groups; + DEBUG_PRINTF("ANCH REPLAY MATCH id=%u offset=%u\n", it, curr_loc); + const u64a som = 0; + const u8 flags = 0; + hwlmcb_rv_t rv = roseRunProgram(t, scratch, programs[it], som, curr_loc, + flags); + DEBUG_PRINTF("DONE groups=0x%016llx\n", tctxt->groups); + + /* anchored literals can't safely set groups. + * However we may be setting groups that successors already + * have worked out that we don't need to match the group */ + DEBUG_PRINTF("groups in %016llx out %016llx\n", old_groups, + tctxt->groups); + tctxt->groups &= old_groups; + + if (rv == HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + } + + /* clear row; does not invalidate iteration */ + bf64_unset(&scratch->al_log_sum, curr_loc - 1); + + return HWLM_CONTINUE_MATCHING; +} + +static really_inline +u32 anchored_it_begin(struct hs_scratch *scratch) { + struct RoseContext *tctxt = &scratch->tctxt; + if (tctxt->lastEndOffset >= scratch->anchored_literal_region_len) { + return MMB_INVALID; + } + u32 begin = tctxt->lastEndOffset; + begin--; + + return bf64_iterate(scratch->al_log_sum, begin); +} + +static really_inline +hwlmcb_rv_t flushAnchoredLiterals(const struct RoseEngine *t, + struct hs_scratch *scratch, + u32 *anchored_it_param, u64a to_off) { + struct RoseContext *tctxt = &scratch->tctxt; + u32 anchored_it = *anchored_it_param; + /* catch up any remaining anchored matches */ + for (; anchored_it != MMB_INVALID && anchored_it < to_off; + anchored_it = bf64_iterate(scratch->al_log_sum, anchored_it)) { + assert(anchored_it < scratch->anchored_literal_region_len); + DEBUG_PRINTF("loc_it = %u\n", anchored_it); + u32 curr_off = anchored_it + 1; + roseFlushLastByteHistory(t, scratch, curr_off); + tctxt->lastEndOffset = curr_off; + + if (flushAnchoredLiteralAtLoc(t, scratch, curr_off) + == HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + } + + *anchored_it_param = anchored_it; + return HWLM_CONTINUE_MATCHING; +} + +static really_inline +hwlmcb_rv_t playVictims(const struct RoseEngine *t, struct hs_scratch *scratch, + u32 *anchored_it, u64a lastEnd, u64a victimDelaySlots, + struct fatbit **delaySlots) { + while (victimDelaySlots) { + u32 vic = findAndClearLSB_64(&victimDelaySlots); + DEBUG_PRINTF("vic = %u\n", vic); + u64a vicOffset = vic + (lastEnd & ~(u64a)DELAY_MASK); + + if (flushAnchoredLiterals(t, scratch, anchored_it, vicOffset) + == HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + + if (playDelaySlot(t, scratch, delaySlots, vic % DELAY_SLOT_COUNT, + vicOffset) == HWLM_TERMINATE_MATCHING) { + return 
HWLM_TERMINATE_MATCHING; + } + } + + return HWLM_CONTINUE_MATCHING; +} + +/* call flushQueuedLiterals instead */ +hwlmcb_rv_t flushQueuedLiterals_i(const struct RoseEngine *t, + struct hs_scratch *scratch, u64a currEnd) { + struct RoseContext *tctxt = &scratch->tctxt; + u64a lastEnd = tctxt->delayLastEndOffset; + DEBUG_PRINTF("flushing backed up matches @%llu up from %llu\n", currEnd, + lastEnd); + + assert(currEnd != lastEnd); /* checked in main entry point */ + + u32 anchored_it = anchored_it_begin(scratch); + + if (!tctxt->filledDelayedSlots) { + DEBUG_PRINTF("no delayed, no flush\n"); + goto anchored_leftovers; + } + + { + struct fatbit **delaySlots = getDelaySlots(scratch); + + u32 lastIndex = lastEnd & DELAY_MASK; + u32 currIndex = currEnd & DELAY_MASK; + + int wrapped = (lastEnd | DELAY_MASK) < currEnd; + + u64a victimDelaySlots; /* needs to be twice as wide as the number of + * slots. */ + + DEBUG_PRINTF("hello %08x\n", tctxt->filledDelayedSlots); + if (!wrapped) { + victimDelaySlots = tctxt->filledDelayedSlots; + + DEBUG_PRINTF("unwrapped %016llx %08x\n", victimDelaySlots, + tctxt->filledDelayedSlots); + /* index vars < 32 so 64bit shifts are safe */ + + /* clear all slots at last index and below, */ + victimDelaySlots &= ~((1LLU << (lastIndex + 1)) - 1); + + /* clear all slots above curr index */ + victimDelaySlots &= (1LLU << (currIndex + 1)) - 1; + + tctxt->filledDelayedSlots &= ~victimDelaySlots; + + DEBUG_PRINTF("unwrapped %016llx %08x\n", victimDelaySlots, + tctxt->filledDelayedSlots); + } else { + DEBUG_PRINTF("wrapped %08x\n", tctxt->filledDelayedSlots); + + /* 1st half: clear all slots at last index and below, */ + u64a first_half = tctxt->filledDelayedSlots; + first_half &= ~((1ULL << (lastIndex + 1)) - 1); + tctxt->filledDelayedSlots &= (1ULL << (lastIndex + 1)) - 1; + + u64a second_half = tctxt->filledDelayedSlots; + + if (currEnd > lastEnd + DELAY_SLOT_COUNT) { + /* 2nd half: clear all slots above last index */ + second_half &= (1ULL << (lastIndex + 1)) - 1; + } else { + /* 2nd half: clear all slots above curr index */ + second_half &= (1ULL << (currIndex + 1)) - 1; + } + tctxt->filledDelayedSlots &= ~second_half; + + victimDelaySlots = first_half | (second_half << DELAY_SLOT_COUNT); + + DEBUG_PRINTF("-- %016llx %016llx = %016llx (li %u)\n", first_half, + second_half, victimDelaySlots, lastIndex); + } + + if (playVictims(t, scratch, &anchored_it, lastEnd, victimDelaySlots, + delaySlots) == HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + } + +anchored_leftovers:; + hwlmcb_rv_t rv = flushAnchoredLiterals(t, scratch, &anchored_it, currEnd); + tctxt->delayLastEndOffset = currEnd; + return rv; +} + +static really_inline +hwlmcb_rv_t roseCallback_i(size_t end, u32 id, struct hs_scratch *scratch) { + struct RoseContext *tctx = &scratch->tctxt; + const struct RoseEngine *t = scratch->core_info.rose; + + u64a real_end = end + tctx->lit_offset_adjust; + +#if defined(DEBUG) + DEBUG_PRINTF("MATCH id=%u end offset@%llu: ", id, real_end); + u64a start = real_end < 8 ? 
1 : real_end - 7; + printMatch(&scratch->core_info, start, real_end); + printf("\n"); +#endif + DEBUG_PRINTF("last end %llu\n", tctx->lastEndOffset); + + DEBUG_PRINTF("STATE groups=0x%016llx\n", tctx->groups); + + if (can_stop_matching(scratch)) { + DEBUG_PRINTF("received a match when we're already dead!\n"); + return HWLM_TERMINATE_MATCHING; + } + + hwlmcb_rv_t rv = flushQueuedLiterals(t, scratch, real_end); + /* flushDelayed may have advanced tctx->lastEndOffset */ + + if (real_end >= t->floatingMinLiteralMatchOffset) { + roseFlushLastByteHistory(t, scratch, real_end); + tctx->lastEndOffset = real_end; + } + + if (rv == HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + + rv = roseProcessMatchInline(t, scratch, real_end, id); + + DEBUG_PRINTF("DONE groups=0x%016llx\n", tctx->groups); + + if (rv != HWLM_TERMINATE_MATCHING) { + return tctx->groups; + } + + assert(can_stop_matching(scratch)); + DEBUG_PRINTF("user requested halt\n"); + return HWLM_TERMINATE_MATCHING; +} + +hwlmcb_rv_t roseCallback(size_t end, u32 id, struct hs_scratch *scratch) { + return roseCallback_i(end, id, scratch); +} + +hwlmcb_rv_t roseFloatingCallback(size_t end, u32 id, + struct hs_scratch *scratch) { + const struct RoseEngine *t = scratch->core_info.rose; + + return roseCallback_i(end, id, scratch) & t->floating_group_mask; +} + +/** + * \brief Execute a boundary report program. + * + * Returns MO_HALT_MATCHING if the stream is exhausted or the user has + * instructed us to halt, or MO_CONTINUE_MATCHING otherwise. + */ +int roseRunBoundaryProgram(const struct RoseEngine *rose, u32 program, + u64a stream_offset, struct hs_scratch *scratch) { + DEBUG_PRINTF("running boundary program at offset %u\n", program); + + if (can_stop_matching(scratch)) { + DEBUG_PRINTF("can stop matching\n"); + return MO_HALT_MATCHING; + } + + if (rose->hasSom && scratch->deduper.current_report_offset == ~0ULL) { + /* we cannot delay the initialization of the som deduper logs any longer + * as we are reporting matches. This is done explicitly as we are + * shortcutting the som handling in the vacuous repeats as we know they + * all come from non-som patterns. */ + fatbit_clear(scratch->deduper.som_log[0]); + fatbit_clear(scratch->deduper.som_log[1]); + scratch->deduper.som_log_dirty = 0; + } + + // Keep assertions in program report path happy. At offset zero, there can + // have been no earlier reports. At EOD, all earlier reports should have + // been handled and we will have been caught up to the stream offset by the + // time we are running boundary report programs. + scratch->tctxt.minMatchOffset = stream_offset; + + const u64a som = 0; + const u8 flags = 0; + hwlmcb_rv_t rv = roseRunProgram(rose, scratch, program, som, stream_offset, + flags); + if (rv == HWLM_TERMINATE_MATCHING) { + return MO_HALT_MATCHING; + } + + return MO_CONTINUE_MATCHING; +} + +/** + * \brief Execute a flush combination program. + * + * Returns MO_HALT_MATCHING if the stream is exhausted or the user has + * instructed us to halt, or MO_CONTINUE_MATCHING otherwise. + */ +int roseRunFlushCombProgram(const struct RoseEngine *rose, + struct hs_scratch *scratch, u64a end) { + hwlmcb_rv_t rv = roseRunProgram(rose, scratch, rose->flushCombProgramOffset, + 0, end, 0); + if (rv == HWLM_TERMINATE_MATCHING) { + return MO_HALT_MATCHING; + } + return MO_CONTINUE_MATCHING; +} + +/** + * \brief Execute last flush combination program. 
+ * + * Returns MO_HALT_MATCHING if the stream is exhausted or the user has + * instructed us to halt, or MO_CONTINUE_MATCHING otherwise. + */ +int roseRunLastFlushCombProgram(const struct RoseEngine *rose, + struct hs_scratch *scratch, u64a end) { + hwlmcb_rv_t rv = roseRunProgram(rose, scratch, + rose->lastFlushCombProgramOffset, + 0, end, 0); + if (rv == HWLM_TERMINATE_MATCHING) { + return MO_HALT_MATCHING; + } + return MO_CONTINUE_MATCHING; +} + +int roseReportAdaptor(u64a start, u64a end, ReportID id, void *context) { + struct hs_scratch *scratch = context; + assert(scratch && scratch->magic == SCRATCH_MAGIC); + + DEBUG_PRINTF("id=%u matched at [%llu,%llu]\n", id, start, end); + + const struct RoseEngine *rose = scratch->core_info.rose; + + // Our match ID is the program offset. + const u32 program = id; + const u8 flags = ROSE_PROG_FLAG_SKIP_MPV_CATCHUP; + hwlmcb_rv_t rv; + if (rose->pureLiteral) { + rv = roseRunProgram_l(rose, scratch, program, start, end, flags); + } else { + rv = roseRunProgram(rose, scratch, program, start, end, flags); + } + if (rv == HWLM_TERMINATE_MATCHING) { + return MO_HALT_MATCHING; + } + + return can_stop_matching(scratch) ? MO_HALT_MATCHING : MO_CONTINUE_MATCHING; +} diff --git a/regex/rose/match.h b/regex/rose/match.h new file mode 100644 index 000000000..c03b1ebba --- /dev/null +++ b/regex/rose/match.h @@ -0,0 +1,383 @@ +/* + * Copyright (c) 2015-2018, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef ROSE_MATCH_H +#define ROSE_MATCH_H + +#include "catchup.h" +#include "runtime.h" +#include "scratch.h" +#include "report.h" +#include "rose_common.h" +#include "rose_internal.h" +#include "ue2common.h" +#include "hwlm/hwlm.h" +#include "nfa/nfa_api.h" +#include "nfa/nfa_api_queue.h" +#include "nfa/nfa_api_util.h" +#include "som/som_runtime.h" +#include "util/bitutils.h" +#include "util/exhaust.h" +#include "util/fatbit.h" +#include "util/multibit.h" + +/* Callbacks, defined in catchup.c */ + +int roseNfaAdaptor(u64a start, u64a end, ReportID id, void *context); + +/* Callbacks, defined in match.c */ + +hwlmcb_rv_t roseCallback(size_t end, u32 id, struct hs_scratch *scratch); +hwlmcb_rv_t roseFloatingCallback(size_t end, u32 id, + struct hs_scratch *scratch); +hwlmcb_rv_t roseDelayRebuildCallback(size_t end, u32 id, + struct hs_scratch *scratch); +int roseAnchoredCallback(u64a start, u64a end, u32 id, void *ctx); + +/* Common code, used all over Rose runtime */ + +hwlmcb_rv_t roseHandleChainMatch(const struct RoseEngine *t, + struct hs_scratch *scratch, u32 event, + u64a top_squash_distance, u64a end, + char in_catchup); + +/** \brief Initialize the queue for a suffix/outfix engine. */ +static really_inline +void initQueue(struct mq *q, u32 qi, const struct RoseEngine *t, + struct hs_scratch *scratch) { + const struct NfaInfo *info = getNfaInfoByQueue(t, qi); + assert(scratch->fullState); + q->nfa = getNfaByInfo(t, info); + q->end = 0; + q->cur = 0; + q->state = scratch->fullState + info->fullStateOffset; + q->streamState = scratch->core_info.state + info->stateOffset; + q->offset = scratch->core_info.buf_offset; + q->buffer = scratch->core_info.buf; + q->length = scratch->core_info.len; + q->history = scratch->core_info.hbuf; + q->hlength = scratch->core_info.hlen; + q->cb = roseNfaAdaptor; + q->context = scratch; + q->report_current = 0; + + DEBUG_PRINTF("qi=%u, offset=%llu, fullState=%u, streamState=%u, " + "state=%u\n", qi, q->offset, info->fullStateOffset, + info->stateOffset, *(u32 *)q->state); +} + +/** \brief Initialize the queue for a leftfix (prefix/infix) engine. */ +static really_inline +void initRoseQueue(const struct RoseEngine *t, u32 qi, + const struct LeftNfaInfo *left, + struct hs_scratch *scratch) { + struct mq *q = scratch->queues + qi; + const struct NfaInfo *info = getNfaInfoByQueue(t, qi); + q->nfa = getNfaByInfo(t, info); + q->end = 0; + q->cur = 0; + q->state = scratch->fullState + info->fullStateOffset; + + // Transient roses don't have stream state, we use tstate in scratch + // instead. The only reason we need this at ALL is for LimEx extended + // regions, which assume that they have access to q->streamState + + // compressedStateSize. 
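+    // (tstate lives in scratch, so transient leftfix state is never carried across stream writes.)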
+ if (left->transient) { + q->streamState = (char *)scratch->tstate + info->stateOffset; + } else { + q->streamState = scratch->core_info.state + info->stateOffset; + } + + q->offset = scratch->core_info.buf_offset; + q->buffer = scratch->core_info.buf; + q->length = scratch->core_info.len; + q->history = scratch->core_info.hbuf; + q->hlength = scratch->core_info.hlen; + q->cb = NULL; + q->context = NULL; + q->report_current = 0; + + DEBUG_PRINTF("qi=%u, offset=%llu, fullState=%u, streamState=%u, " + "state=%u\n", qi, q->offset, info->fullStateOffset, + info->stateOffset, *(u32 *)q->state); +} + +/** returns 0 if space for two items (top and end) on the queue */ +static really_inline +char isQueueFull(const struct mq *q) { + return q->end + 2 > MAX_MQE_LEN; +} + +static really_inline +void loadStreamState(const struct NFA *nfa, struct mq *q, s64a loc) { + DEBUG_PRINTF("offset=%llu, length=%zu, hlength=%zu, loc=%lld\n", + q->offset, q->length, q->hlength, loc); + nfaExpandState(nfa, q->state, q->streamState, q->offset + loc, + queue_prev_byte(q, loc)); +} + +static really_inline +void storeRoseDelay(const struct RoseEngine *t, char *state, + const struct LeftNfaInfo *left, u32 loc) { + u32 di = left->lagIndex; + if (di == ROSE_OFFSET_INVALID) { + return; + } + + assert(loc < 256); // ONE WHOLE BYTE! + DEBUG_PRINTF("storing rose delay %u in slot %u\n", loc, di); + u8 *leftfixDelay = getLeftfixLagTable(t, state); + assert(loc <= MAX_STORED_LEFTFIX_LAG); + leftfixDelay[di] = loc; +} + +static really_inline +void setAsZombie(const struct RoseEngine *t, char *state, + const struct LeftNfaInfo *left) { + u32 di = left->lagIndex; + assert(di != ROSE_OFFSET_INVALID); + if (di == ROSE_OFFSET_INVALID) { + return; + } + + u8 *leftfixDelay = getLeftfixLagTable(t, state); + leftfixDelay[di] = OWB_ZOMBIE_ALWAYS_YES; +} + +/* loadRoseDelay MUST NOT be called on the first stream write as it is only + * initialized for running nfas on stream boundaries */ +static really_inline +u32 loadRoseDelay(const struct RoseEngine *t, const char *state, + const struct LeftNfaInfo *left) { + u32 di = left->lagIndex; + if (di == ROSE_OFFSET_INVALID) { + return 0; + } + + const u8 *leftfixDelay = getLeftfixLagTableConst(t, state); + u32 loc = leftfixDelay[di]; + DEBUG_PRINTF("read rose delay %u from slot %u\n", loc, di); + return loc; +} + +static really_inline +char isZombie(const struct RoseEngine *t, const char *state, + const struct LeftNfaInfo *left) { + u32 di = left->lagIndex; + assert(di != ROSE_OFFSET_INVALID); + if (di == ROSE_OFFSET_INVALID) { + return 0; + } + + const u8 *leftfixDelay = getLeftfixLagTableConst(t, state); + DEBUG_PRINTF("read owb %hhu from slot %u\n", leftfixDelay[di], di); + return leftfixDelay[di] == OWB_ZOMBIE_ALWAYS_YES; +} + +hwlmcb_rv_t flushQueuedLiterals_i(const struct RoseEngine *t, + struct hs_scratch *scratch, u64a end); + +static really_inline +hwlmcb_rv_t flushQueuedLiterals(const struct RoseEngine *t, + struct hs_scratch *scratch, u64a end) { + struct RoseContext *tctxt = &scratch->tctxt; + + if (tctxt->delayLastEndOffset == end) { + DEBUG_PRINTF("no progress, no flush\n"); + return HWLM_CONTINUE_MATCHING; + } + + if (!tctxt->filledDelayedSlots && !scratch->al_log_sum) { + tctxt->delayLastEndOffset = end; + return HWLM_CONTINUE_MATCHING; + } + + return flushQueuedLiterals_i(t, scratch, end); +} + +static really_inline +hwlmcb_rv_t cleanUpDelayed(const struct RoseEngine *t, + struct hs_scratch *scratch, size_t length, + u64a offset) { + if (can_stop_matching(scratch)) { + return 
HWLM_TERMINATE_MATCHING; + } + + if (flushQueuedLiterals(t, scratch, length + offset) + == HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + + struct RoseContext *tctxt = &scratch->tctxt; + if (tctxt->filledDelayedSlots) { + DEBUG_PRINTF("dirty\n"); + scratch->core_info.status |= STATUS_DELAY_DIRTY; + } else { + scratch->core_info.status &= ~STATUS_DELAY_DIRTY; + } + + tctxt->filledDelayedSlots = 0; + tctxt->delayLastEndOffset = offset; + + return HWLM_CONTINUE_MATCHING; +} + +static rose_inline +void roseFlushLastByteHistory(const struct RoseEngine *t, + struct hs_scratch *scratch, u64a currEnd) { + if (!t->lastByteHistoryIterOffset) { + return; + } + + struct RoseContext *tctxt = &scratch->tctxt; + struct core_info *ci = &scratch->core_info; + + /* currEnd is last byte of string + 1 */ + if (tctxt->lastEndOffset == ci->buf_offset + ci->len + || currEnd != ci->buf_offset + ci->len) { + /* already flushed or it is not yet time to flush */ + return; + } + + DEBUG_PRINTF("flushing\n"); + + const struct mmbit_sparse_iter *it = + getByOffset(t, t->lastByteHistoryIterOffset); + assert(ISALIGNED(it)); + + const u32 numStates = t->rolesWithStateCount; + void *role_state = getRoleState(scratch->core_info.state); + + struct mmbit_sparse_state si_state[MAX_SPARSE_ITER_STATES]; + + mmbit_sparse_iter_unset(role_state, numStates, it, si_state); +} + +static rose_inline +int roseHasInFlightMatches(const struct RoseEngine *t, char *state, + const struct hs_scratch *scratch) { + if (scratch->al_log_sum) { + DEBUG_PRINTF("anchored literals in log\n"); + return 1; + } + + if (scratch->tctxt.filledDelayedSlots) { + DEBUG_PRINTF("delayed literal\n"); + return 1; + } + + if (mmbit_any(getRoleState(state), t->rolesWithStateCount)) { + DEBUG_PRINTF("role state is set\n"); + return 1; + } + + return 0; +} + +static rose_inline +hwlmcb_rv_t roseHaltIfExhausted(const struct RoseEngine *t, + struct hs_scratch *scratch) { + struct core_info *ci = &scratch->core_info; + if (isAllExhausted(t, ci->exhaustionVector)) { + ci->status |= STATUS_EXHAUSTED; + scratch->tctxt.groups = 0; + DEBUG_PRINTF("all exhausted, termination requested\n"); + return HWLM_TERMINATE_MATCHING; + } + + return HWLM_CONTINUE_MATCHING; +} + +static really_inline +hwlmcb_rv_t ensureQueueFlushed_i(const struct RoseEngine *t, + struct hs_scratch *scratch, u32 qi, s64a loc, + char is_mpv, char in_catchup) { + struct RoseContext *tctxt = &scratch->tctxt; + u8 *aa = getActiveLeafArray(t, scratch->core_info.state); + struct fatbit *activeQueues = scratch->aqa; + u32 aaCount = t->activeArrayCount; + u32 qCount = t->queueCount; + + struct mq *q = &scratch->queues[qi]; + DEBUG_PRINTF("qcl %lld, loc: %lld, min (non mpv) match offset: %llu\n", + q_cur_loc(q), loc, tctxt->minNonMpvMatchOffset); + if (q_cur_loc(q) == loc) { + /* too many tops enqueued at the one spot; need to flatten this queue. + * We can use the full catchups as it will short circuit as we are + * already at this location. 
It also saves waking everybody up */ + pushQueueNoMerge(q, MQE_END, loc); + nfaQueueExec(q->nfa, q, loc); + q->cur = q->end = 0; + pushQueueAt(q, 0, MQE_START, loc); + } else if (!in_catchup) { + if (is_mpv) { + tctxt->next_mpv_offset = 0; /* force us to catch the mpv */ + if (loc + scratch->core_info.buf_offset + <= tctxt->minNonMpvMatchOffset) { + DEBUG_PRINTF("flushing chained\n"); + if (roseCatchUpMPV(t, loc, scratch) == + HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + goto done_queue_empty; + } + } + + if (roseCatchUpTo(t, scratch, loc + scratch->core_info.buf_offset) == + HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + } else { + /* we must be a chained nfa */ + assert(is_mpv); + DEBUG_PRINTF("flushing chained\n"); + tctxt->next_mpv_offset = 0; /* force us to catch the mpv */ + if (roseCatchUpMPV(t, loc, scratch) == HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + } +done_queue_empty: + if (!mmbit_set(aa, aaCount, qi)) { + initQueue(q, qi, t, scratch); + nfaQueueInitState(q->nfa, q); + pushQueueAt(q, 0, MQE_START, loc); + fatbit_set(activeQueues, qCount, qi); + } + + assert(!isQueueFull(q)); + + return roseHaltIfExhausted(t, scratch); +} + +static rose_inline +hwlmcb_rv_t ensureQueueFlushed(const struct RoseEngine *t, + struct hs_scratch *scratch, u32 qi, s64a loc) { + return ensureQueueFlushed_i(t, scratch, qi, loc, 0, 0); +} + +#endif diff --git a/regex/rose/miracle.h b/regex/rose/miracle.h new file mode 100644 index 000000000..604c50205 --- /dev/null +++ b/regex/rose/miracle.h @@ -0,0 +1,138 @@ +/* + * Copyright (c) 2015, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef ROSE_MIRACLE_H +#define ROSE_MIRACLE_H + +#include "ue2common.h" +#include "runtime.h" +#include "rose_internal.h" + +/** \brief Maximum number of bytes to scan when looking for a "miracle" stop + * character. 
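+ * roseMiracleOccurs() clamps its scan window to at most this many bytes, ending at end_loc - left->maxLag.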
*/ +#define MIRACLE_LEN_MAX 32 + +static really_inline +u64a roseMiracleScan(const u8 *stop, const u8 *d, const u8 *d_start) { + assert(d >= d_start); + + // Note: unrolling this loop manually does appear to reduce its + // performance. I'm sick of tilting at this particular windmill. + + u32 mshift = 0; + do { + u64a s = (u64a)stop[*d]; + if (s) { + s <<= mshift; + return s; + } + mshift++; + } while (--d >= d_start); + return 0; +} + +/** + * \brief "Miracle" scan: uses stop table to check if we can skip forward to a + * location where we know that the given rose engine will be in a known state. + * + * Scans the buffer/history between relative locations \a begin_loc and \a + * end_loc, and returns a miracle location (if any) that appears in the stream + * after \a begin_loc. + * + * Returns 1 if some bytes can be skipped and sets \a miracle_loc + * appropriately, 0 otherwise. + */ +static rose_inline +char roseMiracleOccurs(const struct RoseEngine *t, + const struct LeftNfaInfo *left, + const struct core_info *ci, const s64a begin_loc, + const s64a end_loc, s64a *miracle_loc) { + assert(!left->transient); + assert(left->stopTable); + + DEBUG_PRINTF("looking for miracle over [%lld,%lld], maxLag=%u\n", + begin_loc, end_loc, left->maxLag); + DEBUG_PRINTF("ci->len=%zu, ci->hlen=%zu\n", ci->len, ci->hlen); + + assert(begin_loc <= end_loc); + assert(begin_loc >= -(s64a)ci->hlen); + assert(end_loc <= (s64a)ci->len); + + const u8 *stop = getByOffset(t, left->stopTable); + + const s64a scan_end_loc = end_loc - left->maxLag; + if (scan_end_loc <= begin_loc) { + DEBUG_PRINTF("nothing to scan\n"); + return 0; + } + + const s64a start = MAX(begin_loc, scan_end_loc - MIRACLE_LEN_MAX); + DEBUG_PRINTF("scan [%lld..%lld]\n", start, scan_end_loc); + + u64a s = 0; // state, on bits are miracle locations + + // Scan buffer. + const s64a buf_scan_start = MAX(0, start); + if (scan_end_loc > buf_scan_start) { + const u8 *buf = ci->buf; + const u8 *d = buf + scan_end_loc - 1; + const u8 *d_start = buf + buf_scan_start; + s = roseMiracleScan(stop, d, d_start); + if (s) { + goto miracle_found; + } + } + + // Scan history. + if (start < 0) { + const u8 *hbuf_end = ci->hbuf + ci->hlen; + const u8 *d = hbuf_end + MIN(0, scan_end_loc) - 1; + const u8 *d_start = hbuf_end + start; + s = roseMiracleScan(stop, d, d_start); + if (scan_end_loc > 0) { + // Shift s over to account for the buffer scan above. + s <<= scan_end_loc; + } + } + + if (s) { + miracle_found: + DEBUG_PRINTF("s=0x%llx, ctz=%u\n", s, ctz64(s)); + s64a loc = end_loc - left->maxLag - ctz64(s) - 1; + if (loc > begin_loc) { + DEBUG_PRINTF("miracle at %lld\n", loc); + *miracle_loc = loc; + return 1; + } + } + + DEBUG_PRINTF("no viable miraculous stop characters found\n"); + return 0; +} + +#endif // ROSE_MIRACLE_H diff --git a/regex/rose/program_runtime.c b/regex/rose/program_runtime.c new file mode 100644 index 000000000..e6d1d5ae3 --- /dev/null +++ b/regex/rose/program_runtime.c @@ -0,0 +1,3509 @@ +/* + * Copyright (c) 2015-2020, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** + * \file + * \brief Rose runtime: program interpreter. + */ + +#include "program_runtime.h" + +#include "catchup.h" +#include "counting_miracle.h" +#include "infix.h" +#include "match.h" +#include "miracle.h" +#include "report.h" +#include "rose_common.h" +#include "rose_internal.h" +#include "rose_program.h" +#include "rose_types.h" +#include "validate_mask.h" +#include "validate_shufti.h" +#include "runtime.h" +#include "util/compare.h" +#include "util/copybytes.h" +#include "util/fatbit.h" +#include "util/multibit.h" + +/* Inline implementation follows. */ + +static rose_inline +void rosePushDelayedMatch(const struct RoseEngine *t, + struct hs_scratch *scratch, u32 delay, + u32 delay_index, u64a offset) { + assert(delay); + + const u32 src_slot_index = delay; + u32 slot_index = (src_slot_index + offset) & DELAY_MASK; + + struct RoseContext *tctxt = &scratch->tctxt; + if (offset + src_slot_index <= tctxt->delayLastEndOffset) { + DEBUG_PRINTF("skip too late\n"); + return; + } + + const u32 delay_count = t->delay_count; + struct fatbit **delaySlots = getDelaySlots(scratch); + struct fatbit *slot = delaySlots[slot_index]; + + DEBUG_PRINTF("pushing tab %u into slot %u\n", delay_index, slot_index); + if (!(tctxt->filledDelayedSlots & (1U << slot_index))) { + tctxt->filledDelayedSlots |= 1U << slot_index; + fatbit_clear(slot); + } + + fatbit_set(slot, delay_count, delay_index); +} + +static rose_inline +void recordAnchoredLiteralMatch(const struct RoseEngine *t, + struct hs_scratch *scratch, u32 anch_id, + u64a end) { + assert(end); + + if (end <= t->floatingMinLiteralMatchOffset) { + return; + } + + struct fatbit **anchoredLiteralRows = getAnchoredLiteralLog(scratch); + + DEBUG_PRINTF("record %u (of %u) @ %llu\n", anch_id, t->anchored_count, end); + + if (!bf64_set(&scratch->al_log_sum, end - 1)) { + // first time, clear row + DEBUG_PRINTF("clearing %llu/%u\n", end - 1, t->anchored_count); + fatbit_clear(anchoredLiteralRows[end - 1]); + } + + assert(anch_id < t->anchored_count); + fatbit_set(anchoredLiteralRows[end - 1], t->anchored_count, anch_id); +} + +static rose_inline +char roseLeftfixCheckMiracles(const struct RoseEngine *t, + const struct LeftNfaInfo *left, + struct core_info *ci, struct mq *q, u64a end, + const char is_infix) { + if (!is_infix && left->transient) { + // Miracles won't help us with transient leftfix engines; they only + // scan for a limited time anyway. 
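+        // (returning 1 here means "leftfix still viable", not that a miracle fired)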
+ return 1; + } + + if (!left->stopTable) { + return 1; + } + + DEBUG_PRINTF("looking for miracle on queue %u\n", q->nfa->queueIndex); + + const s64a begin_loc = q_cur_loc(q); + const s64a end_loc = end - ci->buf_offset; + + s64a miracle_loc; + if (roseMiracleOccurs(t, left, ci, begin_loc, end_loc, &miracle_loc)) { + goto found_miracle; + } + + if (roseCountingMiracleOccurs(t, left, ci, begin_loc, end_loc, + &miracle_loc)) { + goto found_miracle; + } + + return 1; + +found_miracle: + DEBUG_PRINTF("miracle at %lld\n", miracle_loc); + assert(miracle_loc >= begin_loc); + + // If we're a prefix, then a miracle effectively results in us needing to + // re-init our state and start fresh. + if (!is_infix) { + if (miracle_loc != begin_loc) { + DEBUG_PRINTF("re-init prefix state\n"); + q->cur = q->end = 0; + pushQueueAt(q, 0, MQE_START, miracle_loc); + pushQueueAt(q, 1, MQE_TOP, miracle_loc); + nfaQueueInitState(q->nfa, q); + } + return 1; + } + + // Otherwise, we're an infix. Remove tops before the miracle from the queue + // and re-init at that location. + + q_skip_forward_to(q, miracle_loc); + + if (q_last_type(q) == MQE_START) { + DEBUG_PRINTF("miracle caused infix to die\n"); + return 0; + } + + DEBUG_PRINTF("re-init infix state\n"); + assert(q->items[q->cur].type == MQE_START); + q->items[q->cur].location = miracle_loc; + nfaQueueInitState(q->nfa, q); + + return 1; +} + +static rose_inline +hwlmcb_rv_t roseTriggerSuffix(const struct RoseEngine *t, + struct hs_scratch *scratch, u32 qi, u32 top, + u64a som, u64a end) { + DEBUG_PRINTF("suffix qi=%u, top event=%u\n", qi, top); + + struct core_info *ci = &scratch->core_info; + u8 *aa = getActiveLeafArray(t, ci->state); + const u32 aaCount = t->activeArrayCount; + const u32 qCount = t->queueCount; + struct mq *q = &scratch->queues[qi]; + const struct NfaInfo *info = getNfaInfoByQueue(t, qi); + const struct NFA *nfa = getNfaByInfo(t, info); + + s64a loc = (s64a)end - ci->buf_offset; + assert(loc <= (s64a)ci->len && loc >= -(s64a)ci->hlen); + + if (!mmbit_set(aa, aaCount, qi)) { + initQueue(q, qi, t, scratch); + nfaQueueInitState(nfa, q); + pushQueueAt(q, 0, MQE_START, loc); + fatbit_set(scratch->aqa, qCount, qi); + } else if (info->no_retrigger) { + DEBUG_PRINTF("yawn\n"); + /* nfa only needs one top; we can go home now */ + return HWLM_CONTINUE_MATCHING; + } else if (!fatbit_set(scratch->aqa, qCount, qi)) { + initQueue(q, qi, t, scratch); + loadStreamState(nfa, q, 0); + pushQueueAt(q, 0, MQE_START, 0); + } else if (isQueueFull(q)) { + DEBUG_PRINTF("queue %u full -> catching up nfas\n", qi); + if (info->eod) { + /* can catch up suffix independently no pq */ + q->context = NULL; + pushQueueNoMerge(q, MQE_END, loc); + nfaQueueExecRose(q->nfa, q, MO_INVALID_IDX); + q->cur = q->end = 0; + pushQueueAt(q, 0, MQE_START, loc); + } else if (ensureQueueFlushed(t, scratch, qi, loc) + == HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + } + + assert(top == MQE_TOP || (top >= MQE_TOP_FIRST && top < MQE_INVALID)); + pushQueueSom(q, top, loc, som); + + if (q_cur_loc(q) == (s64a)ci->len && !info->eod) { + /* we may not run the nfa; need to ensure state is fine */ + DEBUG_PRINTF("empty run\n"); + pushQueueNoMerge(q, MQE_END, loc); + char alive = nfaQueueExec(nfa, q, loc); + if (alive) { + q->cur = q->end = 0; + pushQueueAt(q, 0, MQE_START, loc); + } else { + mmbit_unset(aa, aaCount, qi); + fatbit_unset(scratch->aqa, qCount, qi); + } + } + + return HWLM_CONTINUE_MATCHING; +} + +static really_inline +char roseTestLeftfix(const struct RoseEngine *t, 
struct hs_scratch *scratch, + u32 qi, u32 leftfixLag, ReportID leftfixReport, u64a end, + const char is_infix) { + struct core_info *ci = &scratch->core_info; + + u32 ri = queueToLeftIndex(t, qi); + const struct LeftNfaInfo *left = getLeftTable(t) + ri; + + DEBUG_PRINTF("testing %s %s %u/%u with lag %u (maxLag=%u)\n", + (left->transient ? "transient" : "active"), + (is_infix ? "infix" : "prefix"), + ri, qi, leftfixLag, left->maxLag); + + assert(leftfixLag <= left->maxLag); + assert(left->infix == is_infix); + assert(!is_infix || !left->transient); // Only prefixes can be transient. + + struct mq *q = scratch->queues + qi; + char *state = scratch->core_info.state; + u8 *activeLeftArray = getActiveLeftArray(t, state); + u32 qCount = t->queueCount; + u32 arCount = t->activeLeftCount; + + if (!mmbit_isset(activeLeftArray, arCount, ri)) { + DEBUG_PRINTF("engine is dead nothing to see here\n"); + return 0; + } + + if (unlikely(end < leftfixLag)) { + assert(0); /* lag is the literal length */ + return 0; + } + + if (nfaSupportsZombie(getNfaByQueue(t, qi)) && ci->buf_offset + && !fatbit_isset(scratch->aqa, qCount, qi) + && isZombie(t, state, left)) { + DEBUG_PRINTF("zombie\n"); + return 1; + } + + if (!fatbit_set(scratch->aqa, qCount, qi)) { + DEBUG_PRINTF("initing q %u\n", qi); + initRoseQueue(t, qi, left, scratch); + if (ci->buf_offset) { // there have been writes before us! + s32 sp; + if (!is_infix && left->transient) { + sp = -(s32)ci->hlen; + } else { + sp = -(s32)loadRoseDelay(t, state, left); + } + + /* transient nfas are always started fresh -> state not maintained + * at stream boundary */ + + pushQueueAt(q, 0, MQE_START, sp); + if (is_infix || (ci->buf_offset + sp > 0 && !left->transient)) { + loadStreamState(q->nfa, q, sp); + } else { + pushQueueAt(q, 1, MQE_TOP, sp); + nfaQueueInitState(q->nfa, q); + } + } else { // first write ever + pushQueueAt(q, 0, MQE_START, 0); + pushQueueAt(q, 1, MQE_TOP, 0); + nfaQueueInitState(q->nfa, q); + } + } + + s64a loc = (s64a)end - ci->buf_offset - leftfixLag; + assert(loc >= q_cur_loc(q) || left->eager); + assert(leftfixReport != MO_INVALID_IDX); + + if (!is_infix && left->transient) { + s64a start_loc = loc - left->transient; + if (q_cur_loc(q) < start_loc) { + q->cur = q->end = 0; + pushQueueAt(q, 0, MQE_START, start_loc); + pushQueueAt(q, 1, MQE_TOP, start_loc); + nfaQueueInitState(q->nfa, q); + } + } + + if (q_cur_loc(q) < loc || q_last_type(q) != MQE_START) { + if (is_infix) { + if (infixTooOld(q, loc)) { + DEBUG_PRINTF("infix %u died of old age\n", ri); + goto nfa_dead; + } + + reduceInfixQueue(q, loc, left->maxQueueLen, q->nfa->maxWidth); + } + + if (!roseLeftfixCheckMiracles(t, left, ci, q, end, is_infix)) { + DEBUG_PRINTF("leftfix %u died due to miracle\n", ri); + goto nfa_dead; + } + +#ifdef DEBUG + debugQueue(q); +#endif + + pushQueueNoMerge(q, MQE_END, loc); + + char rv = nfaQueueExecRose(q->nfa, q, leftfixReport); + if (!rv) { /* nfa is dead */ + DEBUG_PRINTF("leftfix %u died while trying to catch up\n", ri); + goto nfa_dead; + } + + // Queue must have next start loc before we call nfaInAcceptState. + q->cur = q->end = 0; + pushQueueAt(q, 0, MQE_START, loc); + + DEBUG_PRINTF("checking for report %u\n", leftfixReport); + DEBUG_PRINTF("leftfix done %hhd\n", (signed char)rv); + return rv == MO_MATCHES_PENDING; + } else if (q_cur_loc(q) > loc) { + /* an eager leftfix may have already progressed past loc if there is no + * match at loc. 
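+         * There is nothing new to report at loc in that case, so the check simply fails.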
*/ + assert(left->eager); + return 0; + } else { + assert(q_cur_loc(q) == loc); + DEBUG_PRINTF("checking for report %u\n", leftfixReport); + char rv = nfaInAcceptState(q->nfa, leftfixReport, q); + DEBUG_PRINTF("leftfix done %hhd\n", (signed char)rv); + return rv; + } + +nfa_dead: + mmbit_unset(activeLeftArray, arCount, ri); + scratch->tctxt.groups &= left->squash_mask; + return 0; +} + +static rose_inline +char roseTestPrefix(const struct RoseEngine *t, struct hs_scratch *scratch, + u32 qi, u32 leftfixLag, ReportID leftfixReport, u64a end) { + return roseTestLeftfix(t, scratch, qi, leftfixLag, leftfixReport, end, 0); +} + +static rose_inline +char roseTestInfix(const struct RoseEngine *t, struct hs_scratch *scratch, + u32 qi, u32 leftfixLag, ReportID leftfixReport, u64a end) { + return roseTestLeftfix(t, scratch, qi, leftfixLag, leftfixReport, end, 1); +} + +static rose_inline +void roseTriggerInfix(const struct RoseEngine *t, struct hs_scratch *scratch, + u64a start, u64a end, u32 qi, u32 topEvent, u8 cancel) { + struct core_info *ci = &scratch->core_info; + s64a loc = (s64a)end - ci->buf_offset; + + u32 ri = queueToLeftIndex(t, qi); + assert(topEvent < MQE_INVALID); + + const struct LeftNfaInfo *left = getLeftInfoByQueue(t, qi); + assert(!left->transient); + + DEBUG_PRINTF("rose %u (qi=%u) event %u\n", ri, qi, topEvent); + + struct mq *q = scratch->queues + qi; + const struct NfaInfo *info = getNfaInfoByQueue(t, qi); + + char *state = ci->state; + u8 *activeLeftArray = getActiveLeftArray(t, state); + const u32 arCount = t->activeLeftCount; + char alive = mmbit_set(activeLeftArray, arCount, ri); + + if (alive && info->no_retrigger) { + DEBUG_PRINTF("yawn\n"); + return; + } + + struct fatbit *aqa = scratch->aqa; + const u32 qCount = t->queueCount; + + if (alive && nfaSupportsZombie(getNfaByInfo(t, info)) && ci->buf_offset && + !fatbit_isset(aqa, qCount, qi) && isZombie(t, state, left)) { + DEBUG_PRINTF("yawn - zombie\n"); + return; + } + + if (cancel) { + DEBUG_PRINTF("dominating top: (re)init\n"); + fatbit_set(aqa, qCount, qi); + initRoseQueue(t, qi, left, scratch); + pushQueueAt(q, 0, MQE_START, loc); + nfaQueueInitState(q->nfa, q); + } else if (!fatbit_set(aqa, qCount, qi)) { + DEBUG_PRINTF("initing %u\n", qi); + initRoseQueue(t, qi, left, scratch); + if (alive) { + s32 sp = -(s32)loadRoseDelay(t, state, left); + pushQueueAt(q, 0, MQE_START, sp); + loadStreamState(q->nfa, q, sp); + } else { + pushQueueAt(q, 0, MQE_START, loc); + nfaQueueInitState(q->nfa, q); + } + } else if (!alive) { + q->cur = q->end = 0; + pushQueueAt(q, 0, MQE_START, loc); + nfaQueueInitState(q->nfa, q); + } else if (isQueueFull(q)) { + reduceInfixQueue(q, loc, left->maxQueueLen, q->nfa->maxWidth); + + if (isQueueFull(q)) { + /* still full - reduceInfixQueue did nothing */ + DEBUG_PRINTF("queue %u full (%u items) -> catching up nfa\n", qi, + q->end - q->cur); + pushQueueNoMerge(q, MQE_END, loc); + nfaQueueExecRose(q->nfa, q, MO_INVALID_IDX); + + q->cur = q->end = 0; + pushQueueAt(q, 0, MQE_START, loc); + } + } + + pushQueueSom(q, topEvent, loc, start); +} + +static rose_inline +hwlmcb_rv_t roseReport(const struct RoseEngine *t, struct hs_scratch *scratch, + u64a end, ReportID onmatch, s32 offset_adjust, + u32 ekey) { + DEBUG_PRINTF("firing callback onmatch=%u, end=%llu\n", onmatch, end); + updateLastMatchOffset(&scratch->tctxt, end); + + int cb_rv = roseDeliverReport(end, onmatch, offset_adjust, scratch, ekey); + if (cb_rv == MO_HALT_MATCHING) { + DEBUG_PRINTF("termination requested\n"); + return 
HWLM_TERMINATE_MATCHING; + } + + if (ekey == INVALID_EKEY || cb_rv == ROSE_CONTINUE_MATCHING_NO_EXHAUST) { + return HWLM_CONTINUE_MATCHING; + } + + return roseHaltIfExhausted(t, scratch); +} + +static rose_inline +hwlmcb_rv_t roseReportComb(const struct RoseEngine *t, + struct hs_scratch *scratch, u64a end, + ReportID onmatch, s32 offset_adjust, u32 ekey) { + DEBUG_PRINTF("firing callback onmatch=%u, end=%llu\n", onmatch, end); + + int cb_rv = roseDeliverReport(end, onmatch, offset_adjust, scratch, ekey); + if (cb_rv == MO_HALT_MATCHING) { + DEBUG_PRINTF("termination requested\n"); + return HWLM_TERMINATE_MATCHING; + } + + if (ekey == INVALID_EKEY || cb_rv == ROSE_CONTINUE_MATCHING_NO_EXHAUST) { + return HWLM_CONTINUE_MATCHING; + } + + return roseHaltIfExhausted(t, scratch); +} + +/* catches up engines enough to ensure any earlier mpv triggers are enqueued + * and then adds the trigger to the mpv queue. */ +static rose_inline +hwlmcb_rv_t roseCatchUpAndHandleChainMatch(const struct RoseEngine *t, + struct hs_scratch *scratch, + u32 event, u64a top_squash_distance, + u64a end, const char in_catchup) { + if (!in_catchup && + roseCatchUpMpvFeeders(t, scratch, end) == HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + return roseHandleChainMatch(t, scratch, event, top_squash_distance, end, + in_catchup); +} + +static rose_inline +void roseHandleSom(struct hs_scratch *scratch, const struct som_operation *sr, + u64a end) { + DEBUG_PRINTF("end=%llu, minMatchOffset=%llu\n", end, + scratch->tctxt.minMatchOffset); + + updateLastMatchOffset(&scratch->tctxt, end); + handleSomInternal(scratch, sr, end); +} + +static rose_inline +hwlmcb_rv_t roseReportSom(const struct RoseEngine *t, + struct hs_scratch *scratch, u64a start, u64a end, + ReportID onmatch, s32 offset_adjust, u32 ekey) { + DEBUG_PRINTF("firing som callback onmatch=%u, start=%llu, end=%llu\n", + onmatch, start, end); + updateLastMatchOffset(&scratch->tctxt, end); + + int cb_rv = roseDeliverSomReport(start, end, onmatch, offset_adjust, + scratch, ekey); + if (cb_rv == MO_HALT_MATCHING) { + DEBUG_PRINTF("termination requested\n"); + return HWLM_TERMINATE_MATCHING; + } + + if (ekey == INVALID_EKEY || cb_rv == ROSE_CONTINUE_MATCHING_NO_EXHAUST) { + return HWLM_CONTINUE_MATCHING; + } + + return roseHaltIfExhausted(t, scratch); +} + +static rose_inline +void roseHandleSomSom(struct hs_scratch *scratch, + const struct som_operation *sr, u64a start, u64a end) { + DEBUG_PRINTF("start=%llu, end=%llu, minMatchOffset=%llu\n", start, end, + scratch->tctxt.minMatchOffset); + + updateLastMatchOffset(&scratch->tctxt, end); + setSomFromSomAware(scratch, sr, start, end); +} + +static rose_inline +hwlmcb_rv_t roseSetExhaust(const struct RoseEngine *t, + struct hs_scratch *scratch, u32 ekey) { + assert(scratch); + assert(scratch->magic == SCRATCH_MAGIC); + + struct core_info *ci = &scratch->core_info; + + assert(!can_stop_matching(scratch)); + assert(!isExhausted(ci->rose, ci->exhaustionVector, ekey)); + + markAsMatched(ci->rose, ci->exhaustionVector, ekey); + + return roseHaltIfExhausted(t, scratch); +} + +static really_inline +int reachHasBit(const u8 *reach, u8 c) { + return !!(reach[c / 8U] & (u8)1U << (c % 8U)); +} + +/* + * Generate a 8-byte valid_mask with #high bytes 0 from the highest side + * and #low bytes 0 from the lowest side + * and (8 - high - low) bytes '0xff' in the middle. 
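+ * e.g. generateValidMask(2, 1) == 0x0000ffffffffff00 (two high and one low byte cleared).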
+ */ +static rose_inline +u64a generateValidMask(const s32 high, const s32 low) { + assert(high + low < 8); + DEBUG_PRINTF("high %d low %d\n", high, low); + const u64a ones = ~0ull; + return (ones << ((high + low) * 8)) >> (high * 8); +} + +/* + * Do the single-byte check if only one lookaround entry exists + * and it's a single mask. + * Return success if the byte is in the future or before history + * (offset is greater than (history) buffer length). + */ +static rose_inline +int roseCheckByte(const struct core_info *ci, u8 and_mask, u8 cmp_mask, + u8 negation, s32 checkOffset, u64a end) { + DEBUG_PRINTF("end=%llu, buf_offset=%llu, buf_end=%llu\n", end, + ci->buf_offset, ci->buf_offset + ci->len); + if (unlikely(checkOffset < 0 && (u64a)(0 - checkOffset) > end)) { + DEBUG_PRINTF("too early, fail\n"); + return 0; + } + + const s64a base_offset = end - ci->buf_offset; + s64a offset = base_offset + checkOffset; + DEBUG_PRINTF("checkOffset=%d offset=%lld\n", checkOffset, offset); + u8 c; + if (offset >= 0) { + if (offset >= (s64a)ci->len) { + DEBUG_PRINTF("in the future\n"); + return 1; + } else { + assert(offset < (s64a)ci->len); + DEBUG_PRINTF("check byte in buffer\n"); + c = ci->buf[offset]; + } + } else { + if (offset >= -(s64a) ci->hlen) { + DEBUG_PRINTF("check byte in history\n"); + c = ci->hbuf[ci->hlen + offset]; + } else { + DEBUG_PRINTF("before history and return\n"); + return 1; + } + } + + if (((and_mask & c) != cmp_mask) ^ negation) { + DEBUG_PRINTF("char 0x%02x at offset %lld failed byte check\n", + c, offset); + return 0; + } + + DEBUG_PRINTF("real offset=%lld char=%02x\n", offset, c); + DEBUG_PRINTF("OK :)\n"); + return 1; +} + +static rose_inline +int roseCheckMask(const struct core_info *ci, u64a and_mask, u64a cmp_mask, + u64a neg_mask, s32 checkOffset, u64a end) { + const s64a base_offset = (s64a)end - ci->buf_offset; + s64a offset = base_offset + checkOffset; + DEBUG_PRINTF("rel offset %lld\n",base_offset); + DEBUG_PRINTF("checkOffset %d offset %lld\n", checkOffset, offset); + if (unlikely(checkOffset < 0 && (u64a)(0 - checkOffset) > end)) { + DEBUG_PRINTF("too early, fail\n"); + return 0; + } + + u64a data = 0; + u64a valid_data_mask = ~0ULL; // mask for validate check. + //A 0xff byte means that this byte is in the buffer. + s32 shift_l = 0; // size of bytes in the future. + s32 shift_r = 0; // size of bytes before the history. + s32 h_len = 0; // size of bytes in the history buffer. + s32 c_len = 8; // size of bytes in the current buffer. + if (offset < 0) { + // in or before history buffer. + if (offset + 8 <= -(s64a)ci->hlen) { + DEBUG_PRINTF("before history and return\n"); + return 1; + } + const u8 *h_start = ci->hbuf; // start pointer in history buffer. + if (offset < -(s64a)ci->hlen) { + // some bytes are before history. + shift_r = -(offset + (s64a)ci->hlen); + DEBUG_PRINTF("shift_r %d", shift_r); + } else { + h_start += ci->hlen + offset; + } + if (offset + 7 < 0) { + DEBUG_PRINTF("all in history buffer\n"); + data = partial_load_u64a(h_start, 8 - shift_r); + } else { + // history part + c_len = offset + 8; + h_len = -offset - shift_r; + DEBUG_PRINTF("%d bytes in history\n", h_len); + s64a data_h = 0; + data_h = partial_load_u64a(h_start, h_len); + // current part + if (c_len > (s64a)ci->len) { + shift_l = c_len - ci->len; + c_len = ci->len; + } + data = partial_load_u64a(ci->buf, c_len); + data <<= h_len << 3; + data |= data_h; + } + if (shift_r) { + data <<= shift_r << 3; + } + } else { + // current buffer. 
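+        // Load up to 8 bytes from the current buffer; bytes past its end stay zero and are blinded by valid_data_mask below.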
+ if (offset + c_len > (s64a)ci->len) { + if (offset >= (s64a)ci->len) { + DEBUG_PRINTF("all in the future\n"); + return 1; + } + // some bytes in the future. + shift_l = offset + c_len - ci->len; + c_len = ci->len - offset; + data = partial_load_u64a(ci->buf + offset, c_len); + } else { + data = unaligned_load_u64a(ci->buf + offset); + } + } + + if (shift_l || shift_r) { + valid_data_mask = generateValidMask(shift_l, shift_r); + } + DEBUG_PRINTF("valid_data_mask %llx\n", valid_data_mask); + + if (validateMask(data, valid_data_mask, + and_mask, cmp_mask, neg_mask)) { + DEBUG_PRINTF("check mask successfully\n"); + return 1; + } else { + return 0; + } +} + +static rose_inline +int roseCheckMask32(const struct core_info *ci, const u8 *and_mask, + const u8 *cmp_mask, const u32 neg_mask, + s32 checkOffset, u64a end) { + const s64a base_offset = (s64a)end - ci->buf_offset; + s64a offset = base_offset + checkOffset; + DEBUG_PRINTF("end %lld base_offset %lld\n", end, base_offset); + DEBUG_PRINTF("checkOffset %d offset %lld\n", checkOffset, offset); + + if (unlikely(checkOffset < 0 && (u64a)(0 - checkOffset) > end)) { + DEBUG_PRINTF("too early, fail\n"); + return 0; + } + + m256 data = zeroes256(); // consists of the following four parts. + s32 c_shift = 0; // blank bytes after current. + s32 h_shift = 0; // blank bytes before history. + s32 h_len = 32; // number of bytes from history buffer. + s32 c_len = 0; // number of bytes from current buffer. + /* h_shift + h_len + c_len + c_shift = 32 need to be hold.*/ + + if (offset < 0) { + s32 h_offset = 0; // the start offset in history buffer. + if (offset < -(s64a)ci->hlen) { + if (offset + 32 <= -(s64a)ci->hlen) { + DEBUG_PRINTF("all before history\n"); + return 1; + } + h_shift = -(offset + (s64a)ci->hlen); + h_len = 32 - h_shift; + } else { + h_offset = ci->hlen + offset; + } + if (offset + 32 > 0) { + // part in current buffer. + c_len = offset + 32; + h_len = -(offset + h_shift); + if (c_len > (s64a)ci->len) { + // out of current buffer. + c_shift = c_len - ci->len; + c_len = ci->len; + } + copy_upto_64_bytes((u8 *)&data - offset, ci->buf, c_len); + } + assert(h_shift + h_len + c_len + c_shift == 32); + copy_upto_64_bytes((u8 *)&data + h_shift, ci->hbuf + h_offset, h_len); + } else { + if (offset + 32 > (s64a)ci->len) { + if (offset >= (s64a)ci->len) { + DEBUG_PRINTF("all in the future.\n"); + return 1; + } + c_len = ci->len - offset; + c_shift = 32 - c_len; + assert(c_len <= 32); + copy_upto_64_bytes((u8 *)&data, ci->buf + offset, c_len); + } else { + data = loadu256(ci->buf + offset); + } + } + DEBUG_PRINTF("h_shift %d c_shift %d\n", h_shift, c_shift); + DEBUG_PRINTF("h_len %d c_len %d\n", h_len, c_len); + // we use valid_data_mask to blind bytes before history/in the future. 
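+    // Bit i of valid_data_mask is set iff byte i of 'data' holds real input rather than padding.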
+ u32 valid_data_mask; + valid_data_mask = (~0u) << (h_shift + c_shift) >> (c_shift); + + m256 and_mask_m256 = loadu256(and_mask); + m256 cmp_mask_m256 = loadu256(cmp_mask); + if (validateMask32(data, valid_data_mask, and_mask_m256, + cmp_mask_m256, neg_mask)) { + DEBUG_PRINTF("Mask32 passed\n"); + return 1; + } + return 0; +} + +#ifdef HAVE_AVX512 +static rose_inline +int roseCheckMask64(const struct core_info *ci, const u8 *and_mask, + const u8 *cmp_mask, const u64a neg_mask, + s32 checkOffset, u64a end) { + const s64a base_offset = (s64a)end - ci->buf_offset; + s64a offset = base_offset + checkOffset; + DEBUG_PRINTF("end %lld base_offset %lld\n", end, base_offset); + DEBUG_PRINTF("checkOffset %d offset %lld\n", checkOffset, offset); + + if (unlikely(checkOffset < 0 && (u64a)(0 - checkOffset) > end)) { + DEBUG_PRINTF("too early, fail\n"); + return 0; + } + + m512 data = zeroes512(); // consists of the following four parts. + s32 c_shift = 0; // blank bytes after current. + s32 h_shift = 0; // blank bytes before history. + s32 h_len = 64; // number of bytes from history buffer. + s32 c_len = 0; // number of bytes from current buffer. + /* h_shift + h_len + c_len + c_shift = 64 need to be hold.*/ + + if (offset < 0) { + s32 h_offset = 0; // the start offset in history buffer. + if (offset < -(s64a)ci->hlen) { + if (offset + 64 <= -(s64a)ci->hlen) { + DEBUG_PRINTF("all before history\n"); + return 1; + } + h_shift = -(offset + (s64a)ci->hlen); + h_len = 64 - h_shift; + } else { + h_offset = ci->hlen + offset; + } + if (offset + 64 > 0) { + // part in current buffer. + c_len = offset + 64; + h_len = -(offset + h_shift); + if (c_len > (s64a)ci->len) { + // out of current buffer. + c_shift = c_len - ci->len; + c_len = ci->len; + } + copy_upto_64_bytes((u8 *)&data - offset, ci->buf, c_len); + } + assert(h_shift + h_len + c_len + c_shift == 64); + copy_upto_64_bytes((u8 *)&data + h_shift, ci->hbuf + h_offset, h_len); + } else { + if (offset + 64 > (s64a)ci->len) { + if (offset >= (s64a)ci->len) { + DEBUG_PRINTF("all in the future.\n"); + return 1; + } + c_len = ci->len - offset; + c_shift = 64 - c_len; + copy_upto_64_bytes((u8 *)&data, ci->buf + offset, c_len); + } else { + data = loadu512(ci->buf + offset); + } + } + DEBUG_PRINTF("h_shift %d c_shift %d\n", h_shift, c_shift); + DEBUG_PRINTF("h_len %d c_len %d\n", h_len, c_len); + // we use valid_data_mask to blind bytes before history/in the future. + u64a valid_data_mask; + valid_data_mask = (~0ULL) << (h_shift + c_shift) >> (c_shift); + + m512 and_mask_m512 = loadu512(and_mask); + m512 cmp_mask_m512 = loadu512(cmp_mask); + + if (validateMask64(data, valid_data_mask, and_mask_m512, + cmp_mask_m512, neg_mask)) { + DEBUG_PRINTF("Mask64 passed\n"); + return 1; + } + return 0; +} +#endif + +// get 128/256/512 bits data from history and current buffer. +// return data and valid_data_mask. +static rose_inline +u64a getBufferDataComplex(const struct core_info *ci, const s64a loc, + u8 *data, const u32 data_len) { + assert(data_len == 16 || data_len == 32 || data_len == 64); + s32 c_shift = 0; // blank bytes after current. + s32 h_shift = 0; // blank bytes before history. + s32 h_len = data_len; // number of bytes from history buffer. + s32 c_len = 0; // number of bytes from current buffer. + if (loc < 0) { + s32 h_offset = 0; // the start offset in history buffer. 
+ if (loc < -(s64a)ci->hlen) { + if (loc + data_len <= -(s64a)ci->hlen) { + DEBUG_PRINTF("all before history\n"); + return 0; + } + h_shift = -(loc + (s64a)ci->hlen); + h_len = data_len - h_shift; + } else { + h_offset = ci->hlen + loc; + } + if (loc + data_len > 0) { + // part in current buffer. + c_len = loc + data_len; + h_len = -(loc + h_shift); + if (c_len > (s64a)ci->len) { + // out of current buffer. + c_shift = c_len - ci->len; + c_len = ci->len; + } + copy_upto_64_bytes(data - loc, ci->buf, c_len); + } + assert(h_shift + h_len + c_len + c_shift == (s32)data_len); + copy_upto_64_bytes(data + h_shift, ci->hbuf + h_offset, h_len); + } else { + if (loc + data_len > (s64a)ci->len) { + if (loc >= (s64a)ci->len) { + DEBUG_PRINTF("all in the future.\n"); + return 0; + } + c_len = ci->len - loc; + c_shift = data_len - c_len; + assert(c_len <= data_len); + copy_upto_64_bytes(data, ci->buf + loc, c_len); + } else { +#ifdef HAVE_AVX512 + if (data_len == 64) { + storeu512(data, loadu512(ci->buf + loc)); + return ~0ULL; + } +#endif + if (data_len == 16) { + storeu128(data, loadu128(ci->buf + loc)); + return 0xffff; + } else { + storeu256(data, loadu256(ci->buf + loc)); + return 0xffffffff; + } + } + } + DEBUG_PRINTF("h_shift %d c_shift %d\n", h_shift, c_shift); + DEBUG_PRINTF("h_len %d c_len %d\n", h_len, c_len); + +#ifdef HAVE_AVX512 + if (data_len == 64) { + return (~0ULL) << (h_shift + c_shift) >> c_shift; + } +#endif + if (data_len == 16) { + return (u16)(0xffff << (h_shift + c_shift)) >> c_shift; + } else { + return (~0u) << (h_shift + c_shift) >> c_shift; + } +} + +static rose_inline +m128 getData128(const struct core_info *ci, s64a offset, u32 *valid_data_mask) { + if (offset > 0 && offset + sizeof(m128) <= ci->len) { + *valid_data_mask = 0xffff; + return loadu128(ci->buf + offset); + } + ALIGN_DIRECTIVE u8 data[sizeof(m128)]; + *valid_data_mask = getBufferDataComplex(ci, offset, data, 16); + return *(m128 *)data; +} + +static rose_inline +m256 getData256(const struct core_info *ci, s64a offset, u32 *valid_data_mask) { + if (offset > 0 && offset + sizeof(m256) <= ci->len) { + *valid_data_mask = ~0u; + return loadu256(ci->buf + offset); + } + ALIGN_AVX_DIRECTIVE u8 data[sizeof(m256)]; + *valid_data_mask = getBufferDataComplex(ci, offset, data, 32); + return *(m256 *)data; +} + +#ifdef HAVE_AVX512 +static rose_inline +m512 getData512(const struct core_info *ci, s64a offset, u64a *valid_data_mask) { + if (offset > 0 && offset + sizeof(m512) <= ci->len) { + *valid_data_mask = ~0ULL; + return loadu512(ci->buf + offset); + } + ALIGN_CL_DIRECTIVE u8 data[sizeof(m512)]; + *valid_data_mask = getBufferDataComplex(ci, offset, data, 64); + return *(m512 *)data; +} +#endif + +static rose_inline +int roseCheckShufti16x8(const struct core_info *ci, const u8 *nib_mask, + const u8 *bucket_select_mask, u32 neg_mask, + s32 checkOffset, u64a end) { + const s64a base_offset = (s64a)end - ci->buf_offset; + s64a offset = base_offset + checkOffset; + DEBUG_PRINTF("end %lld base_offset %lld\n", end, base_offset); + DEBUG_PRINTF("checkOffset %d offset %lld\n", checkOffset, offset); + + if (unlikely(checkOffset < 0 && (u64a)(0 - checkOffset) > end)) { + DEBUG_PRINTF("too early, fail\n"); + return 0; + } + + u32 valid_data_mask = 0; + m128 data = getData128(ci, offset, &valid_data_mask); + if (unlikely(!valid_data_mask)) { + return 1; + } + + m256 nib_mask_m256 = loadu256(nib_mask); + m128 bucket_select_mask_m128 = loadu128(bucket_select_mask); + if (validateShuftiMask16x8(data, nib_mask_m256, + 
bucket_select_mask_m128, + neg_mask, valid_data_mask)) { + DEBUG_PRINTF("check shufti 16x8 successfully\n"); + return 1; + } else { + return 0; + } +} + +static rose_inline +int roseCheckShufti16x16(const struct core_info *ci, const u8 *hi_mask, + const u8 *lo_mask, const u8 *bucket_select_mask, + u32 neg_mask, s32 checkOffset, u64a end) { + const s64a base_offset = (s64a)end - ci->buf_offset; + s64a offset = base_offset + checkOffset; + DEBUG_PRINTF("end %lld base_offset %lld\n", end, base_offset); + DEBUG_PRINTF("checkOffset %d offset %lld\n", checkOffset, offset); + + if (unlikely(checkOffset < 0 && (u64a)(0 - checkOffset) > end)) { + DEBUG_PRINTF("too early, fail\n"); + return 0; + } + + u32 valid_data_mask = 0; + m128 data = getData128(ci, offset, &valid_data_mask); + if (unlikely(!valid_data_mask)) { + return 1; + } + + m256 data_m256 = set2x128(data); + m256 hi_mask_m256 = loadu256(hi_mask); + m256 lo_mask_m256 = loadu256(lo_mask); + m256 bucket_select_mask_m256 = loadu256(bucket_select_mask); + if (validateShuftiMask16x16(data_m256, hi_mask_m256, lo_mask_m256, + bucket_select_mask_m256, + neg_mask, valid_data_mask)) { + DEBUG_PRINTF("check shufti 16x16 successfully\n"); + return 1; + } else { + return 0; + } +} + +static rose_inline +int roseCheckShufti32x8(const struct core_info *ci, const u8 *hi_mask, + const u8 *lo_mask, const u8 *bucket_select_mask, + u32 neg_mask, s32 checkOffset, u64a end) { + const s64a base_offset = (s64a)end - ci->buf_offset; + s64a offset = base_offset + checkOffset; + DEBUG_PRINTF("end %lld base_offset %lld\n", end, base_offset); + DEBUG_PRINTF("checkOffset %d offset %lld\n", checkOffset, offset); + + if (unlikely(checkOffset < 0 && (u64a)(0 - checkOffset) > end)) { + DEBUG_PRINTF("too early, fail\n"); + return 0; + } + + u32 valid_data_mask = 0; + m256 data = getData256(ci, offset, &valid_data_mask); + if (unlikely(!valid_data_mask)) { + return 1; + } + + m128 hi_mask_m128 = loadu128(hi_mask); + m128 lo_mask_m128 = loadu128(lo_mask); + m256 hi_mask_m256 = set2x128(hi_mask_m128); + m256 lo_mask_m256 = set2x128(lo_mask_m128); + m256 bucket_select_mask_m256 = loadu256(bucket_select_mask); + if (validateShuftiMask32x8(data, hi_mask_m256, lo_mask_m256, + bucket_select_mask_m256, + neg_mask, valid_data_mask)) { + DEBUG_PRINTF("check shufti 32x8 successfully\n"); + return 1; + } else { + return 0; + } +} + +static rose_inline +int roseCheckShufti32x16(const struct core_info *ci, const u8 *hi_mask, + const u8 *lo_mask, const u8 *bucket_select_mask_hi, + const u8 *bucket_select_mask_lo, u32 neg_mask, + s32 checkOffset, u64a end) { + const s64a base_offset = (s64a)end - ci->buf_offset; + s64a offset = base_offset + checkOffset; + DEBUG_PRINTF("end %lld base_offset %lld\n", end, base_offset); + DEBUG_PRINTF("checkOffset %d offset %lld\n", checkOffset, offset); + + if (unlikely(checkOffset < 0 && (u64a)(0 - checkOffset) > end)) { + DEBUG_PRINTF("too early, fail\n"); + return 0; + } + + u32 valid_data_mask = 0; + m256 data = getData256(ci, offset, &valid_data_mask); + if (unlikely(!valid_data_mask)) { + return 1; + } + + m256 hi_mask_1 = loadu2x128(hi_mask); + m256 hi_mask_2 = loadu2x128(hi_mask + 16); + m256 lo_mask_1 = loadu2x128(lo_mask); + m256 lo_mask_2 = loadu2x128(lo_mask + 16); + + m256 bucket_mask_hi = loadu256(bucket_select_mask_hi); + m256 bucket_mask_lo = loadu256(bucket_select_mask_lo); + if (validateShuftiMask32x16(data, hi_mask_1, hi_mask_2, + lo_mask_1, lo_mask_2, bucket_mask_hi, + bucket_mask_lo, neg_mask, valid_data_mask)) { + DEBUG_PRINTF("check 
shufti 32x16 successfully\n"); + return 1; + } else { + return 0; + } +} + +#ifdef HAVE_AVX512 +static rose_inline +int roseCheckShufti64x8(const struct core_info *ci, const u8 *hi_mask, + const u8 *lo_mask, const u8 *bucket_select_mask, + u64a neg_mask, s32 checkOffset, u64a end) { + const s64a base_offset = (s64a)end - ci->buf_offset; + s64a offset = base_offset + checkOffset; + DEBUG_PRINTF("end %lld base_offset %lld\n", end, base_offset); + DEBUG_PRINTF("checkOffset %d offset %lld\n", checkOffset, offset); + + if (unlikely(checkOffset < 0 && (u64a)(0 - checkOffset) > end)) { + DEBUG_PRINTF("too early, fail\n"); + return 0; + } + + u64a valid_data_mask = 0; + m512 data = getData512(ci, offset, &valid_data_mask); + + if (unlikely(!valid_data_mask)) { + return 1; + } + + m512 hi_mask_m512 = loadu512(hi_mask); + m512 lo_mask_m512 = loadu512(lo_mask); + m512 bucket_select_mask_m512 = loadu512(bucket_select_mask); + if (validateShuftiMask64x8(data, hi_mask_m512, lo_mask_m512, + bucket_select_mask_m512, + neg_mask, valid_data_mask)) { + DEBUG_PRINTF("check shufti 64x8 successfully\n"); + return 1; + } else { + return 0; + } +} + +static rose_inline +int roseCheckShufti64x16(const struct core_info *ci, const u8 *hi_mask_1, + const u8 *hi_mask_2, const u8 *lo_mask_1, + const u8 *lo_mask_2, const u8 *bucket_select_mask_hi, + const u8 *bucket_select_mask_lo, u64a neg_mask, + s32 checkOffset, u64a end) { + const s64a base_offset = (s64a)end - ci->buf_offset; + s64a offset = base_offset + checkOffset; + DEBUG_PRINTF("end %lld base_offset %lld\n", end, base_offset); + DEBUG_PRINTF("checkOffset %d offset %lld\n", checkOffset, offset); + + if (unlikely(checkOffset < 0 && (u64a)(0 - checkOffset) > end)) { + DEBUG_PRINTF("too early, fail\n"); + return 0; + } + + u64a valid_data_mask = 0; + m512 data = getData512(ci, offset, &valid_data_mask); + if (unlikely(!valid_data_mask)) { + return 1; + } + + m512 hi_mask_1_m512 = loadu512(hi_mask_1); + m512 hi_mask_2_m512 = loadu512(hi_mask_2); + m512 lo_mask_1_m512 = loadu512(lo_mask_1); + m512 lo_mask_2_m512 = loadu512(lo_mask_2); + + m512 bucket_select_mask_hi_m512 = loadu512(bucket_select_mask_hi); + m512 bucket_select_mask_lo_m512 = loadu512(bucket_select_mask_lo); + if (validateShuftiMask64x16(data, hi_mask_1_m512, hi_mask_2_m512, + lo_mask_1_m512, lo_mask_2_m512, + bucket_select_mask_hi_m512, + bucket_select_mask_lo_m512, + neg_mask, valid_data_mask)) { + DEBUG_PRINTF("check shufti 64x16 successfully\n"); + return 1; + } else { + return 0; + } +} +#endif + +static rose_inline +int roseCheckSingleLookaround(const struct RoseEngine *t, + const struct hs_scratch *scratch, + s8 checkOffset, u32 lookaroundReachIndex, + u64a end) { + assert(lookaroundReachIndex != MO_INVALID_IDX); + const struct core_info *ci = &scratch->core_info; + DEBUG_PRINTF("end=%llu, buf_offset=%llu, buf_end=%llu\n", end, + ci->buf_offset, ci->buf_offset + ci->len); + + const s64a base_offset = end - ci->buf_offset; + const s64a offset = base_offset + checkOffset; + DEBUG_PRINTF("base_offset=%lld\n", base_offset); + DEBUG_PRINTF("checkOffset=%d offset=%lld\n", checkOffset, offset); + + if (unlikely(checkOffset < 0 && (u64a)(0 - checkOffset) > end)) { + DEBUG_PRINTF("too early, fail\n"); + return 0; + } + + const u8 *reach = getByOffset(t, lookaroundReachIndex); + + u8 c; + if (offset >= 0 && offset < (s64a)ci->len) { + c = ci->buf[offset]; + } else if (offset < 0 && offset >= -(s64a)ci->hlen) { + c = ci->hbuf[ci->hlen + offset]; + } else { + return 1; + } + + if (!reachHasBit(reach, c)) { 
+ DEBUG_PRINTF("char 0x%02x failed reach check\n", c); + return 0; + } + + DEBUG_PRINTF("OK :)\n"); + return 1; +} + +/** + * \brief Scan around a literal, checking that that "lookaround" reach masks + * are satisfied. + */ +static rose_inline +int roseCheckLookaround(const struct RoseEngine *t, + const struct hs_scratch *scratch, + u32 lookaroundLookIndex, u32 lookaroundReachIndex, + u32 lookaroundCount, u64a end) { + assert(lookaroundLookIndex != MO_INVALID_IDX); + assert(lookaroundReachIndex != MO_INVALID_IDX); + assert(lookaroundCount > 0); + + const struct core_info *ci = &scratch->core_info; + DEBUG_PRINTF("end=%llu, buf_offset=%llu, buf_end=%llu\n", end, + ci->buf_offset, ci->buf_offset + ci->len); + + const s8 *look = getByOffset(t, lookaroundLookIndex); + const s8 *look_end = look + lookaroundCount; + assert(look < look_end); + + const u8 *reach = getByOffset(t, lookaroundReachIndex); + + // The following code assumes that the lookaround structures are ordered by + // increasing offset. + + const s64a base_offset = end - ci->buf_offset; + DEBUG_PRINTF("base_offset=%lld\n", base_offset); + DEBUG_PRINTF("first look has offset %d\n", *look); + + // If our first check tells us we need to look at an offset before the + // start of the stream, this role cannot match. + if (unlikely(*look < 0 && (u64a)(0 - *look) > end)) { + DEBUG_PRINTF("too early, fail\n"); + return 0; + } + + // Skip over offsets that are before the history buffer. + do { + s64a offset = base_offset + *look; + if (offset >= -(s64a)ci->hlen) { + goto in_history; + } + DEBUG_PRINTF("look=%d before history\n", *look); + look++; + reach += REACH_BITVECTOR_LEN; + } while (look < look_end); + + // History buffer. + DEBUG_PRINTF("scan history (%zu looks left)\n", look_end - look); + for (; look < look_end; ++look, reach += REACH_BITVECTOR_LEN) { + in_history: + ; + s64a offset = base_offset + *look; + DEBUG_PRINTF("reach=%p, rel offset=%lld\n", reach, offset); + + if (offset >= 0) { + DEBUG_PRINTF("in buffer\n"); + goto in_buffer; + } + + assert(offset >= -(s64a)ci->hlen && offset < 0); + u8 c = ci->hbuf[ci->hlen + offset]; + if (!reachHasBit(reach, c)) { + DEBUG_PRINTF("char 0x%02x failed reach check\n", c); + return 0; + } + } + // Current buffer. + DEBUG_PRINTF("scan buffer (%zu looks left)\n", look_end - look); + for (; look < look_end; ++look, reach += REACH_BITVECTOR_LEN) { + in_buffer: + ; + s64a offset = base_offset + *look; + DEBUG_PRINTF("reach=%p, rel offset=%lld\n", reach, offset); + + if (offset >= (s64a)ci->len) { + DEBUG_PRINTF("in the future\n"); + break; + } + + assert(offset >= 0 && offset < (s64a)ci->len); + u8 c = ci->buf[offset]; + if (!reachHasBit(reach, c)) { + DEBUG_PRINTF("char 0x%02x failed reach check\n", c); + return 0; + } + } + + DEBUG_PRINTF("OK :)\n"); + return 1; +} + +/** + * \brief Trying to find a matching path by the corresponding path mask of + * every lookaround location. 
+ */ +static rose_inline +int roseMultipathLookaround(const struct RoseEngine *t, + const struct hs_scratch *scratch, + u32 multipathLookaroundLookIndex, + u32 multipathLookaroundReachIndex, + u32 multipathLookaroundCount, + s32 last_start, const u8 *start_mask, + u64a end) { + assert(multipathLookaroundCount > 0); + + const struct core_info *ci = &scratch->core_info; + DEBUG_PRINTF("end=%llu, buf_offset=%llu, buf_end=%llu\n", end, + ci->buf_offset, ci->buf_offset + ci->len); + + const s8 *look = getByOffset(t, multipathLookaroundLookIndex); + const s8 *look_end = look + multipathLookaroundCount; + assert(look < look_end); + + const u8 *reach = getByOffset(t, multipathLookaroundReachIndex); + + const s64a base_offset = (s64a)end - ci->buf_offset; + DEBUG_PRINTF("base_offset=%lld\n", base_offset); + + u8 path = 0xff; + + assert(last_start < 0); + + if (unlikely((u64a)(0 - last_start) > end)) { + DEBUG_PRINTF("too early, fail\n"); + return 0; + } + + s8 base_look_offset = *look; + do { + s64a offset = base_offset + *look; + u32 start_offset = (u32)(*look - base_look_offset); + DEBUG_PRINTF("start_mask[%u] = %x\n", start_offset, + start_mask[start_offset]); + path = start_mask[start_offset]; + if (offset >= -(s64a)ci->hlen) { + break; + } + DEBUG_PRINTF("look=%d before history\n", *look); + look++; + reach += MULTI_REACH_BITVECTOR_LEN; + } while (look < look_end); + + DEBUG_PRINTF("scan history (%zu looks left)\n", look_end - look); + for (; look < look_end; ++look, reach += MULTI_REACH_BITVECTOR_LEN) { + s64a offset = base_offset + *look; + DEBUG_PRINTF("reach=%p, rel offset=%lld\n", reach, offset); + + if (offset >= 0) { + DEBUG_PRINTF("in buffer\n"); + break; + } + + assert(offset >= -(s64a)ci->hlen && offset < 0); + u8 c = ci->hbuf[ci->hlen + offset]; + path &= reach[c]; + DEBUG_PRINTF("reach[%x] = %02x path = %0xx\n", c, reach[c], path); + if (!path) { + DEBUG_PRINTF("char 0x%02x failed reach check\n", c); + return 0; + } + } + + DEBUG_PRINTF("scan buffer (%zu looks left)\n", look_end - look); + for(; look < look_end; ++look, reach += MULTI_REACH_BITVECTOR_LEN) { + s64a offset = base_offset + *look; + DEBUG_PRINTF("reach=%p, rel offset=%lld\n", reach, offset); + + if (offset >= (s64a)ci->len) { + DEBUG_PRINTF("in the future\n"); + break; + } + + assert(offset >= 0 && offset < (s64a)ci->len); + u8 c = ci->buf[offset]; + path &= reach[c]; + DEBUG_PRINTF("reach[%x] = %02x path = %0xx\n", c, reach[c], path); + if (!path) { + DEBUG_PRINTF("char 0x%02x failed reach check\n", c); + return 0; + } + } + + DEBUG_PRINTF("OK :)\n"); + return 1; +} + +static never_inline +int roseCheckMultipathShufti16x8(const struct hs_scratch *scratch, + const struct ROSE_STRUCT_CHECK_MULTIPATH_SHUFTI_16x8 *ri, + u64a end) { + const struct core_info *ci = &scratch->core_info; + s32 checkOffset = ri->base_offset; + const s64a base_offset = (s64a)end - ci->buf_offset; + s64a offset = base_offset + checkOffset; + DEBUG_PRINTF("end %lld base_offset %lld\n", end, base_offset); + DEBUG_PRINTF("checkOffset %d offset %lld\n", checkOffset, offset); + + assert(ri->last_start <= 0); + if (unlikely(checkOffset < 0 && (u64a)(0 - checkOffset) > end)) { + if ((u64a)(0 - ri->last_start) > end) { + DEBUG_PRINTF("too early, fail\n"); + return 0; + } + } + + u32 valid_data_mask; + m128 data_init = getData128(ci, offset, &valid_data_mask); + m128 data_select_mask = loadu128(ri->data_select_mask); + + u32 valid_path_mask = 0; + if (unlikely(!(valid_data_mask & 1))) { + DEBUG_PRINTF("lose part of backward data\n"); + 
DEBUG_PRINTF("valid_data_mask %x\n", valid_data_mask); + + m128 expand_valid; + u64a expand_mask = 0x8080808080808080ULL; + u64a valid_lo = expand64(valid_data_mask & 0xff, expand_mask); + u64a valid_hi = expand64(valid_data_mask >> 8, expand_mask); + DEBUG_PRINTF("expand_hi %llx\n", valid_hi); + DEBUG_PRINTF("expand_lo %llx\n", valid_lo); + expand_valid = set64x2(valid_hi, valid_lo); + valid_path_mask = ~movemask128(pshufb_m128(expand_valid, + data_select_mask)); + } + + m128 data = pshufb_m128(data_init, data_select_mask); + m256 nib_mask = loadu256(ri->nib_mask); + m128 bucket_select_mask = loadu128(ri->bucket_select_mask); + + u32 hi_bits_mask = ri->hi_bits_mask; + u32 lo_bits_mask = ri->lo_bits_mask; + u32 neg_mask = ri->neg_mask; + + if (validateMultipathShuftiMask16x8(data, nib_mask, + bucket_select_mask, + hi_bits_mask, lo_bits_mask, + neg_mask, valid_path_mask)) { + DEBUG_PRINTF("check multi-path shufti-16x8 successfully\n"); + return 1; + } else { + return 0; + } +} + +static never_inline +int roseCheckMultipathShufti32x8(const struct hs_scratch *scratch, + const struct ROSE_STRUCT_CHECK_MULTIPATH_SHUFTI_32x8 *ri, + u64a end) { + const struct core_info *ci = &scratch->core_info; + s32 checkOffset = ri->base_offset; + const s64a base_offset = (s64a)end - ci->buf_offset; + s64a offset = base_offset + checkOffset; + DEBUG_PRINTF("end %lld base_offset %lld\n", end, base_offset); + DEBUG_PRINTF("checkOffset %d offset %lld\n", checkOffset, offset); + + assert(ri->last_start <= 0); + if (unlikely(checkOffset < 0 && (u64a)(0 - checkOffset) > end)) { + if ((u64a)(0 - ri->last_start) > end) { + DEBUG_PRINTF("too early, fail\n"); + return 0; + } + } + + u32 valid_data_mask; + m128 data_m128 = getData128(ci, offset, &valid_data_mask); + m256 data_double = set2x128(data_m128); + m256 data_select_mask = loadu256(ri->data_select_mask); + + u32 valid_path_mask = 0; + m256 expand_valid; + if (unlikely(!(valid_data_mask & 1))) { + DEBUG_PRINTF("lose part of backward data\n"); + DEBUG_PRINTF("valid_data_mask %x\n", valid_data_mask); + + u64a expand_mask = 0x8080808080808080ULL; + u64a valid_lo = expand64(valid_data_mask & 0xff, expand_mask); + u64a valid_hi = expand64(valid_data_mask >> 8, expand_mask); + DEBUG_PRINTF("expand_hi %llx\n", valid_hi); + DEBUG_PRINTF("expand_lo %llx\n", valid_lo); + expand_valid = set64x4(valid_hi, valid_lo, valid_hi, + valid_lo); + valid_path_mask = ~movemask256(pshufb_m256(expand_valid, + data_select_mask)); + } + + m256 data = pshufb_m256(data_double, data_select_mask); + m256 hi_mask = loadu2x128(ri->hi_mask); + m256 lo_mask = loadu2x128(ri->lo_mask); + m256 bucket_select_mask = loadu256(ri->bucket_select_mask); + + u32 hi_bits_mask = ri->hi_bits_mask; + u32 lo_bits_mask = ri->lo_bits_mask; + u32 neg_mask = ri->neg_mask; + + if (validateMultipathShuftiMask32x8(data, hi_mask, lo_mask, + bucket_select_mask, + hi_bits_mask, lo_bits_mask, + neg_mask, valid_path_mask)) { + DEBUG_PRINTF("check multi-path shufti-32x8 successfully\n"); + return 1; + } else { + return 0; + } +} + +static never_inline +int roseCheckMultipathShufti32x16(const struct hs_scratch *scratch, + const struct ROSE_STRUCT_CHECK_MULTIPATH_SHUFTI_32x16 *ri, + u64a end) { + const struct core_info *ci = &scratch->core_info; + const s64a base_offset = (s64a)end - ci->buf_offset; + s32 checkOffset = ri->base_offset; + s64a offset = base_offset + checkOffset; + DEBUG_PRINTF("end %lld base_offset %lld\n", end, base_offset); + DEBUG_PRINTF("checkOffset %d offset %lld\n", checkOffset, offset); + + 
assert(ri->last_start <= 0); + if (unlikely(checkOffset < 0 && (u64a)(0 - checkOffset) > end)) { + if ((u64a)(0 - ri->last_start) > end) { + DEBUG_PRINTF("too early, fail\n"); + return 0; + } + } + + u32 valid_data_mask; + m128 data_m128 = getData128(ci, offset, &valid_data_mask); + m256 data_double = set2x128(data_m128); + m256 data_select_mask = loadu256(ri->data_select_mask); + + u32 valid_path_mask = 0; + m256 expand_valid; + if (unlikely(!(valid_data_mask & 1))) { + DEBUG_PRINTF("lose part of backward data\n"); + DEBUG_PRINTF("valid_data_mask %x\n", valid_data_mask); + + u64a expand_mask = 0x8080808080808080ULL; + u64a valid_lo = expand64(valid_data_mask & 0xff, expand_mask); + u64a valid_hi = expand64(valid_data_mask >> 8, expand_mask); + DEBUG_PRINTF("expand_hi %llx\n", valid_hi); + DEBUG_PRINTF("expand_lo %llx\n", valid_lo); + expand_valid = set64x4(valid_hi, valid_lo, valid_hi, + valid_lo); + valid_path_mask = ~movemask256(pshufb_m256(expand_valid, + data_select_mask)); + } + + m256 data = pshufb_m256(data_double, data_select_mask); + + m256 hi_mask_1 = loadu2x128(ri->hi_mask); + m256 hi_mask_2 = loadu2x128(ri->hi_mask + 16); + m256 lo_mask_1 = loadu2x128(ri->lo_mask); + m256 lo_mask_2 = loadu2x128(ri->lo_mask + 16); + + m256 bucket_select_mask_hi = loadu256(ri->bucket_select_mask_hi); + m256 bucket_select_mask_lo = loadu256(ri->bucket_select_mask_lo); + + u32 hi_bits_mask = ri->hi_bits_mask; + u32 lo_bits_mask = ri->lo_bits_mask; + u32 neg_mask = ri->neg_mask; + + if (validateMultipathShuftiMask32x16(data, hi_mask_1, hi_mask_2, + lo_mask_1, lo_mask_2, + bucket_select_mask_hi, + bucket_select_mask_lo, + hi_bits_mask, lo_bits_mask, + neg_mask, valid_path_mask)) { + DEBUG_PRINTF("check multi-path shufti-32x16 successfully\n"); + return 1; + } else { + return 0; + } +} + +static never_inline +int roseCheckMultipathShufti64(const struct hs_scratch *scratch, + const struct ROSE_STRUCT_CHECK_MULTIPATH_SHUFTI_64 *ri, + u64a end) { + const struct core_info *ci = &scratch->core_info; + const s64a base_offset = (s64a)end - ci->buf_offset; + s32 checkOffset = ri->base_offset; + s64a offset = base_offset + checkOffset; + DEBUG_PRINTF("end %lld base_offset %lld\n", end, base_offset); + DEBUG_PRINTF("checkOffset %d offset %lld\n", checkOffset, offset); + + if (unlikely(checkOffset < 0 && (u64a)(0 - checkOffset) > end)) { + if ((u64a)(0 - ri->last_start) > end) { + DEBUG_PRINTF("too early, fail\n"); + return 0; + } + } + + u32 valid_data_mask; + m128 data_m128 = getData128(ci, offset, &valid_data_mask); + m256 data_m256 = set2x128(data_m128); + m256 data_select_mask_1 = loadu256(ri->data_select_mask); + m256 data_select_mask_2 = loadu256(ri->data_select_mask + 32); + + u64a valid_path_mask = 0; + m256 expand_valid; + if (unlikely(!(valid_data_mask & 1))) { + DEBUG_PRINTF("lose part of backward data\n"); + DEBUG_PRINTF("valid_data_mask %x\n", valid_data_mask); + + u64a expand_mask = 0x8080808080808080ULL; + u64a valid_lo = expand64(valid_data_mask & 0xff, expand_mask); + u64a valid_hi = expand64(valid_data_mask >> 8, expand_mask); + DEBUG_PRINTF("expand_hi %llx\n", valid_hi); + DEBUG_PRINTF("expand_lo %llx\n", valid_lo); + expand_valid = set64x4(valid_hi, valid_lo, valid_hi, + valid_lo); + u32 valid_path_1 = movemask256(pshufb_m256(expand_valid, + data_select_mask_1)); + u32 valid_path_2 = movemask256(pshufb_m256(expand_valid, + data_select_mask_2)); + valid_path_mask = ~((u64a)valid_path_1 | (u64a)valid_path_2 << 32); + } + + m256 data_1 = pshufb_m256(data_m256, data_select_mask_1); + m256 
data_2 = pshufb_m256(data_m256, data_select_mask_2); + + m256 hi_mask = loadu2x128(ri->hi_mask); + m256 lo_mask = loadu2x128(ri->lo_mask); + + m256 bucket_select_mask_1 = loadu256(ri->bucket_select_mask); + m256 bucket_select_mask_2 = loadu256(ri->bucket_select_mask + 32); + + u64a hi_bits_mask = ri->hi_bits_mask; + u64a lo_bits_mask = ri->lo_bits_mask; + u64a neg_mask = ri->neg_mask; + + if (validateMultipathShuftiMask64(data_1, data_2, hi_mask, lo_mask, + bucket_select_mask_1, + bucket_select_mask_2, hi_bits_mask, + lo_bits_mask, neg_mask, + valid_path_mask)) { + DEBUG_PRINTF("check multi-path shufti-64 successfully\n"); + return 1; + } else { + return 0; + } +} + +static rose_inline +int roseNfaEarliestSom(u64a start, UNUSED u64a end, UNUSED ReportID id, + void *context) { + assert(context); + u64a *som = context; + *som = MIN(*som, start); + return MO_CONTINUE_MATCHING; +} + +static rose_inline +u64a roseGetHaigSom(const struct RoseEngine *t, struct hs_scratch *scratch, + const u32 qi, UNUSED const u32 leftfixLag) { + u32 ri = queueToLeftIndex(t, qi); + + UNUSED const struct LeftNfaInfo *left = getLeftTable(t) + ri; + + DEBUG_PRINTF("testing %s prefix %u/%u with lag %u (maxLag=%u)\n", + left->transient ? "transient" : "active", ri, qi, + leftfixLag, left->maxLag); + + assert(leftfixLag <= left->maxLag); + + struct mq *q = scratch->queues + qi; + + u64a start = ~0ULL; + + /* switch the callback + context for a fun one */ + q->cb = roseNfaEarliestSom; + q->context = &start; + + nfaReportCurrentMatches(q->nfa, q); + + /* restore the old callback + context */ + q->cb = roseNfaAdaptor; + q->context = NULL; + DEBUG_PRINTF("earliest som is %llu\n", start); + return start; +} + +static rose_inline +char roseCheckBounds(u64a end, u64a min_bound, u64a max_bound) { + DEBUG_PRINTF("check offset=%llu against bounds [%llu,%llu]\n", end, + min_bound, max_bound); + assert(min_bound <= max_bound); + return end >= min_bound && end <= max_bound; +} + +static rose_inline +hwlmcb_rv_t roseEnginesEod(const struct RoseEngine *rose, + struct hs_scratch *scratch, u64a offset, + u32 iter_offset) { + const char is_streaming = rose->mode != HS_MODE_BLOCK; + + /* data, len is used for state decompress, should be full available data */ + u8 key = 0; + if (is_streaming) { + const u8 *eod_data = scratch->core_info.hbuf; + size_t eod_len = scratch->core_info.hlen; + key = eod_len ? eod_data[eod_len - 1] : 0; + } + + const u8 *aa = getActiveLeafArray(rose, scratch->core_info.state); + const u32 aaCount = rose->activeArrayCount; + const u32 qCount = rose->queueCount; + struct fatbit *aqa = scratch->aqa; + + const struct mmbit_sparse_iter *it = getByOffset(rose, iter_offset); + assert(ISALIGNED(it)); + + u32 idx = 0; + struct mmbit_sparse_state si_state[MAX_SPARSE_ITER_STATES]; + + for (u32 qi = mmbit_sparse_iter_begin(aa, aaCount, &idx, it, si_state); + qi != MMB_INVALID; + qi = mmbit_sparse_iter_next(aa, aaCount, qi, &idx, it, si_state)) { + DEBUG_PRINTF("checking nfa %u\n", qi); + struct mq *q = scratch->queues + qi; + if (!fatbit_set(aqa, qCount, qi)) { + initQueue(q, qi, rose, scratch); + } + + assert(q->nfa == getNfaByQueue(rose, qi)); + assert(nfaAcceptsEod(q->nfa)); + + if (is_streaming) { + // Decompress stream state. 
+ nfaExpandState(q->nfa, q->state, q->streamState, offset, key); + } + + if (nfaCheckFinalState(q->nfa, q->state, q->streamState, offset, + roseReportAdaptor, + scratch) == MO_HALT_MATCHING) { + DEBUG_PRINTF("user instructed us to stop\n"); + return HWLM_TERMINATE_MATCHING; + } + } + + return HWLM_CONTINUE_MATCHING; +} + +static rose_inline +hwlmcb_rv_t roseSuffixesEod(const struct RoseEngine *rose, + struct hs_scratch *scratch, u64a offset) { + const u8 *aa = getActiveLeafArray(rose, scratch->core_info.state); + const u32 aaCount = rose->activeArrayCount; + + for (u32 qi = mmbit_iterate(aa, aaCount, MMB_INVALID); qi != MMB_INVALID; + qi = mmbit_iterate(aa, aaCount, qi)) { + DEBUG_PRINTF("checking nfa %u\n", qi); + struct mq *q = scratch->queues + qi; + assert(q->nfa == getNfaByQueue(rose, qi)); + assert(nfaAcceptsEod(q->nfa)); + + /* We have just been triggered. */ + assert(fatbit_isset(scratch->aqa, rose->queueCount, qi)); + + pushQueueNoMerge(q, MQE_END, scratch->core_info.len); + q->context = NULL; + + /* rose exec is used as we don't want to / can't raise matches in the + * history buffer. */ + if (!nfaQueueExecRose(q->nfa, q, MO_INVALID_IDX)) { + DEBUG_PRINTF("nfa is dead\n"); + continue; + } + if (nfaCheckFinalState(q->nfa, q->state, q->streamState, offset, + roseReportAdaptor, + scratch) == MO_HALT_MATCHING) { + DEBUG_PRINTF("user instructed us to stop\n"); + return HWLM_TERMINATE_MATCHING; + } + } + return HWLM_CONTINUE_MATCHING; +} + +static rose_inline +hwlmcb_rv_t roseMatcherEod(const struct RoseEngine *rose, + struct hs_scratch *scratch, u64a offset) { + assert(rose->ematcherOffset); + assert(rose->ematcherRegionSize); + + // Clear role state and active engines, since we have already handled all + // outstanding work there. + DEBUG_PRINTF("clear role state and active leaf array\n"); + char *state = scratch->core_info.state; + mmbit_clear(getRoleState(state), rose->rolesWithStateCount); + mmbit_clear(getActiveLeafArray(rose, state), rose->activeArrayCount); + + const char is_streaming = rose->mode != HS_MODE_BLOCK; + + size_t eod_len; + const u8 *eod_data; + if (!is_streaming) { /* Block */ + eod_data = scratch->core_info.buf; + eod_len = scratch->core_info.len; + } else { /* Streaming */ + eod_len = scratch->core_info.hlen; + eod_data = scratch->core_info.hbuf; + } + + assert(eod_data); + assert(eod_len); + + DEBUG_PRINTF("%zu bytes of eod data to scan at offset %llu\n", eod_len, + offset); + + // If we don't have enough bytes to produce a match from an EOD table scan, + // there's no point scanning. + if (eod_len < rose->eodmatcherMinWidth) { + DEBUG_PRINTF("too short for min width %u\n", rose->eodmatcherMinWidth); + return HWLM_CONTINUE_MATCHING; + } + + // Ensure that we only need scan the last N bytes, where N is the length of + // the eod-anchored matcher region. + size_t adj = eod_len - MIN(eod_len, rose->ematcherRegionSize); + + const struct HWLM *etable = getByOffset(rose, rose->ematcherOffset); + hwlmExec(etable, eod_data, eod_len, adj, roseCallback, scratch, + scratch->tctxt.groups); + + // We may need to fire delayed matches. 
+ if (cleanUpDelayed(rose, scratch, 0, offset) == HWLM_TERMINATE_MATCHING) { + DEBUG_PRINTF("user instructed us to stop\n"); + return HWLM_TERMINATE_MATCHING; + } + + roseFlushLastByteHistory(rose, scratch, offset); + return HWLM_CONTINUE_MATCHING; +} + +static rose_inline +int roseCheckLongLiteral(const struct RoseEngine *t, + const struct hs_scratch *scratch, u64a end, + u32 lit_offset, u32 lit_length, char nocase) { + const struct core_info *ci = &scratch->core_info; + const u8 *lit = getByOffset(t, lit_offset); + + DEBUG_PRINTF("check lit at %llu, length %u\n", end, lit_length); + DEBUG_PRINTF("base buf_offset=%llu\n", ci->buf_offset); + + if (end < lit_length) { + DEBUG_PRINTF("too short!\n"); + return 0; + } + + // If any portion of the literal matched in the current buffer, check it. + if (end > ci->buf_offset) { + u32 scan_len = MIN(end - ci->buf_offset, lit_length); + u64a scan_start = end - ci->buf_offset - scan_len; + DEBUG_PRINTF("checking suffix (%u bytes) in buf[%llu:%llu]\n", scan_len, + scan_start, end); + if (cmpForward(ci->buf + scan_start, lit + lit_length - scan_len, + scan_len, nocase)) { + DEBUG_PRINTF("cmp of suffix failed\n"); + return 0; + } + } + + // If the entirety of the literal was in the current block, we are done. + if (end - lit_length >= ci->buf_offset) { + DEBUG_PRINTF("literal confirmed in current block\n"); + return 1; + } + + // We still have a prefix which we must test against the buffer prepared by + // the long literal table. This is only done in streaming mode. + + assert(t->mode != HS_MODE_BLOCK); + + const u8 *ll_buf; + size_t ll_len; + if (nocase) { + ll_buf = scratch->tctxt.ll_buf_nocase; + ll_len = scratch->tctxt.ll_len_nocase; + } else { + ll_buf = scratch->tctxt.ll_buf; + ll_len = scratch->tctxt.ll_len; + } + + assert(ll_buf); + + u64a lit_start_offset = end - lit_length; + u32 prefix_len = MIN(lit_length, ci->buf_offset - lit_start_offset); + u32 hist_rewind = ci->buf_offset - lit_start_offset; + DEBUG_PRINTF("ll_len=%zu, hist_rewind=%u\n", ll_len, hist_rewind); + if (hist_rewind > ll_len) { + DEBUG_PRINTF("not enough history\n"); + return 0; + } + + DEBUG_PRINTF("check prefix len=%u from hist (len %zu, rewind %u)\n", + prefix_len, ll_len, hist_rewind); + assert(hist_rewind <= ll_len); + if (cmpForward(ll_buf + ll_len - hist_rewind, lit, prefix_len, nocase)) { + DEBUG_PRINTF("cmp of prefix failed\n"); + return 0; + } + + DEBUG_PRINTF("cmp succeeded\n"); + return 1; +} + +static rose_inline +int roseCheckMediumLiteral(const struct RoseEngine *t, + const struct hs_scratch *scratch, u64a end, + u32 lit_offset, u32 lit_length, char nocase) { + const struct core_info *ci = &scratch->core_info; + const u8 *lit = getByOffset(t, lit_offset); + + DEBUG_PRINTF("check lit at %llu, length %u\n", end, lit_length); + DEBUG_PRINTF("base buf_offset=%llu\n", ci->buf_offset); + + if (end < lit_length) { + DEBUG_PRINTF("too short!\n"); + return 0; + } + + // If any portion of the literal matched in the current buffer, check it. + if (end > ci->buf_offset) { + u32 scan_len = MIN(end - ci->buf_offset, lit_length); + u64a scan_start = end - ci->buf_offset - scan_len; + DEBUG_PRINTF("checking suffix (%u bytes) in buf[%llu:%llu]\n", scan_len, + scan_start, end); + if (cmpForward(ci->buf + scan_start, lit + lit_length - scan_len, + scan_len, nocase)) { + DEBUG_PRINTF("cmp of suffix failed\n"); + return 0; + } + } + + // If the entirety of the literal was in the current block, we are done. 
+ if (end - lit_length >= ci->buf_offset) { + DEBUG_PRINTF("literal confirmed in current block\n"); + return 1; + } + + // We still have a prefix which we must test against the history buffer. + assert(t->mode != HS_MODE_BLOCK); + + u64a lit_start_offset = end - lit_length; + u32 prefix_len = MIN(lit_length, ci->buf_offset - lit_start_offset); + u32 hist_rewind = ci->buf_offset - lit_start_offset; + DEBUG_PRINTF("hlen=%zu, hist_rewind=%u\n", ci->hlen, hist_rewind); + + // History length check required for confirm in the EOD and delayed + // rebuild paths. + if (hist_rewind > ci->hlen) { + DEBUG_PRINTF("not enough history\n"); + return 0; + } + + DEBUG_PRINTF("check prefix len=%u from hist (len %zu, rewind %u)\n", + prefix_len, ci->hlen, hist_rewind); + assert(hist_rewind <= ci->hlen); + if (cmpForward(ci->hbuf + ci->hlen - hist_rewind, lit, prefix_len, + nocase)) { + DEBUG_PRINTF("cmp of prefix failed\n"); + return 0; + } + + DEBUG_PRINTF("cmp succeeded\n"); + return 1; +} + +static +void updateSeqPoint(struct RoseContext *tctxt, u64a offset, + const char from_mpv) { + if (from_mpv) { + updateMinMatchOffsetFromMpv(tctxt, offset); + } else { + updateMinMatchOffset(tctxt, offset); + } +} + +static rose_inline +hwlmcb_rv_t flushActiveCombinations(const struct RoseEngine *t, + struct hs_scratch *scratch) { + u8 *cvec = (u8 *)scratch->core_info.combVector; + if (!mmbit_any(cvec, t->ckeyCount)) { + return HWLM_CONTINUE_MATCHING; + } + u64a end = scratch->tctxt.lastCombMatchOffset; + for (u32 i = mmbit_iterate(cvec, t->ckeyCount, MMB_INVALID); + i != MMB_INVALID; i = mmbit_iterate(cvec, t->ckeyCount, i)) { + const struct CombInfo *combInfoMap = (const struct CombInfo *) + ((const char *)t + t->combInfoMapOffset); + const struct CombInfo *ci = combInfoMap + i; + if ((ci->min_offset != 0) && (end < ci->min_offset)) { + DEBUG_PRINTF("halt: before min_offset=%llu\n", ci->min_offset); + continue; + } + if ((ci->max_offset != MAX_OFFSET) && (end > ci->max_offset)) { + DEBUG_PRINTF("halt: after max_offset=%llu\n", ci->max_offset); + continue; + } + + DEBUG_PRINTF("check ekey %u\n", ci->ekey); + if (ci->ekey != INVALID_EKEY) { + assert(ci->ekey < t->ekeyCount); + const char *evec = scratch->core_info.exhaustionVector; + if (isExhausted(t, evec, ci->ekey)) { + DEBUG_PRINTF("ekey %u already set, match is exhausted\n", + ci->ekey); + continue; + } + } + + DEBUG_PRINTF("check ckey %u\n", i); + char *lvec = scratch->core_info.logicalVector; + if (!isLogicalCombination(t, lvec, ci->start, ci->result)) { + DEBUG_PRINTF("Logical Combination Failed!\n"); + continue; + } + + DEBUG_PRINTF("Logical Combination Passed!\n"); + if (roseReportComb(t, scratch, end, ci->id, 0, + ci->ekey) == HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + } + clearCvec(t, (char *)cvec); + return HWLM_CONTINUE_MATCHING; +} + +static rose_inline +hwlmcb_rv_t checkPurelyNegatives(const struct RoseEngine *t, + struct hs_scratch *scratch, u64a end) { + for (u32 i = 0; i < t->ckeyCount; i++) { + const struct CombInfo *combInfoMap = (const struct CombInfo *) + ((const char *)t + t->combInfoMapOffset); + const struct CombInfo *ci = combInfoMap + i; + if ((ci->min_offset != 0) && (end < ci->min_offset)) { + DEBUG_PRINTF("halt: before min_offset=%llu\n", ci->min_offset); + continue; + } + if ((ci->max_offset != MAX_OFFSET) && (end > ci->max_offset)) { + DEBUG_PRINTF("halt: after max_offset=%llu\n", ci->max_offset); + continue; + } + + DEBUG_PRINTF("check ekey %u\n", ci->ekey); + if (ci->ekey != INVALID_EKEY) { + assert(ci->ekey < 
t->ekeyCount); + const char *evec = scratch->core_info.exhaustionVector; + if (isExhausted(t, evec, ci->ekey)) { + DEBUG_PRINTF("ekey %u already set, match is exhausted\n", + ci->ekey); + continue; + } + } + + DEBUG_PRINTF("check ckey %u purely negative\n", i); + char *lvec = scratch->core_info.logicalVector; + if (!isPurelyNegativeMatch(t, lvec, ci->start, ci->result)) { + DEBUG_PRINTF("Logical Combination from purely negative Failed!\n"); + continue; + } + + DEBUG_PRINTF("Logical Combination from purely negative Passed!\n"); + if (roseReportComb(t, scratch, end, ci->id, 0, + ci->ekey) == HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + } + return HWLM_CONTINUE_MATCHING; +} + +#if !defined(_WIN32) +#define PROGRAM_CASE(name) \ + case ROSE_INSTR_##name: { \ + LABEL_ROSE_INSTR_##name: \ + DEBUG_PRINTF("instruction: " #name " (pc=%u)\n", \ + programOffset + (u32)(pc - pc_base)); \ + const struct ROSE_STRUCT_##name *ri = \ + (const struct ROSE_STRUCT_##name *)pc; + +#define PROGRAM_NEXT_INSTRUCTION \ + pc += ROUNDUP_N(sizeof(*ri), ROSE_INSTR_MIN_ALIGN); \ + goto *(next_instr[*(const u8 *)pc]); \ + } + +#define PROGRAM_NEXT_INSTRUCTION_JUMP \ + goto *(next_instr[*(const u8 *)pc]); +#else +#define PROGRAM_CASE(name) \ + case ROSE_INSTR_##name: { \ + DEBUG_PRINTF("instruction: " #name " (pc=%u)\n", \ + programOffset + (u32)(pc - pc_base)); \ + const struct ROSE_STRUCT_##name *ri = \ + (const struct ROSE_STRUCT_##name *)pc; + +#define PROGRAM_NEXT_INSTRUCTION \ + pc += ROUNDUP_N(sizeof(*ri), ROSE_INSTR_MIN_ALIGN); \ + break; \ + } + +#define PROGRAM_NEXT_INSTRUCTION_JUMP continue; +#endif + +hwlmcb_rv_t roseRunProgram(const struct RoseEngine *t, + struct hs_scratch *scratch, u32 programOffset, + u64a som, u64a end, u8 prog_flags) { + DEBUG_PRINTF("program=%u, offsets [%llu,%llu], flags=%u\n", programOffset, + som, end, prog_flags); + + if (programOffset != ROSE_INVALID_PROG_OFFSET) + assert(programOffset >= sizeof(struct RoseEngine)); + assert(programOffset < t->size); + + const char in_anchored = prog_flags & ROSE_PROG_FLAG_IN_ANCHORED; + const char in_catchup = prog_flags & ROSE_PROG_FLAG_IN_CATCHUP; + const char from_mpv = prog_flags & ROSE_PROG_FLAG_FROM_MPV; + const char skip_mpv_catchup = prog_flags & ROSE_PROG_FLAG_SKIP_MPV_CATCHUP; + + const char *pc_base = getByOffset(t, programOffset); + const char *pc = pc_base; + + // Local sparse iterator state for programs that use the SPARSE_ITER_BEGIN + // and SPARSE_ITER_NEXT instructions. + struct mmbit_sparse_state si_state[MAX_SPARSE_ITER_STATES]; + + // If this program has an effect, work_done will be set to one (which may + // allow the program to squash groups). + int work_done = 0; + + struct RoseContext *tctxt = &scratch->tctxt; + +#if !defined(_WIN32) + static const void *next_instr[] = { + &&LABEL_ROSE_INSTR_END, //!< End of program. + &&LABEL_ROSE_INSTR_ANCHORED_DELAY, //!< Delay until after anchored matcher. + &&LABEL_ROSE_INSTR_CHECK_LIT_EARLY, //!< Skip matches before floating min offset. + &&LABEL_ROSE_INSTR_CHECK_GROUPS, //!< Check that literal groups are on. + &&LABEL_ROSE_INSTR_CHECK_ONLY_EOD, //!< Role matches only at EOD. + &&LABEL_ROSE_INSTR_CHECK_BOUNDS, //!< Bounds on distance from offset 0. + &&LABEL_ROSE_INSTR_CHECK_NOT_HANDLED, //!< Test & set role in "handled". + &&LABEL_ROSE_INSTR_CHECK_SINGLE_LOOKAROUND, //!< Single lookaround check. + &&LABEL_ROSE_INSTR_CHECK_LOOKAROUND, //!< Lookaround check. + &&LABEL_ROSE_INSTR_CHECK_MASK, //!< 8-bytes mask check. 
+ &&LABEL_ROSE_INSTR_CHECK_MASK_32, //!< 32-bytes and/cmp/neg mask check. + &&LABEL_ROSE_INSTR_CHECK_BYTE, //!< Single Byte check. + &&LABEL_ROSE_INSTR_CHECK_SHUFTI_16x8, //!< Check 16-byte data by 8-bucket shufti. + &&LABEL_ROSE_INSTR_CHECK_SHUFTI_32x8, //!< Check 32-byte data by 8-bucket shufti. + &&LABEL_ROSE_INSTR_CHECK_SHUFTI_16x16, //!< Check 16-byte data by 16-bucket shufti. + &&LABEL_ROSE_INSTR_CHECK_SHUFTI_32x16, //!< Check 32-byte data by 16-bucket shufti. + &&LABEL_ROSE_INSTR_CHECK_INFIX, //!< Infix engine must be in accept state. + &&LABEL_ROSE_INSTR_CHECK_PREFIX, //!< Prefix engine must be in accept state. + &&LABEL_ROSE_INSTR_PUSH_DELAYED, //!< Push delayed literal matches. + &&LABEL_ROSE_INSTR_DUMMY_NOP, //!< NOP. Should not exist in build programs. + &&LABEL_ROSE_INSTR_CATCH_UP, //!< Catch up engines, anchored matches. + &&LABEL_ROSE_INSTR_CATCH_UP_MPV, //!< Catch up the MPV. + &&LABEL_ROSE_INSTR_SOM_ADJUST, //!< Set SOM from a distance to EOM. + &&LABEL_ROSE_INSTR_SOM_LEFTFIX, //!< Acquire SOM from a leftfix engine. + &&LABEL_ROSE_INSTR_SOM_FROM_REPORT, //!< Acquire SOM from a som_operation. + &&LABEL_ROSE_INSTR_SOM_ZERO, //!< Set SOM to zero. + &&LABEL_ROSE_INSTR_TRIGGER_INFIX, //!< Trigger an infix engine. + &&LABEL_ROSE_INSTR_TRIGGER_SUFFIX, //!< Trigger a suffix engine. + &&LABEL_ROSE_INSTR_DEDUPE, //!< Run deduplication for report. + &&LABEL_ROSE_INSTR_DEDUPE_SOM, //!< Run deduplication for SOM report. + &&LABEL_ROSE_INSTR_REPORT_CHAIN, //!< Fire a chained report (MPV). + &&LABEL_ROSE_INSTR_REPORT_SOM_INT, //!< Manipulate SOM only. + &&LABEL_ROSE_INSTR_REPORT_SOM_AWARE, //!< Manipulate SOM from SOM-aware source. + &&LABEL_ROSE_INSTR_REPORT, + &&LABEL_ROSE_INSTR_REPORT_EXHAUST, + &&LABEL_ROSE_INSTR_REPORT_SOM, + &&LABEL_ROSE_INSTR_REPORT_SOM_EXHAUST, + &&LABEL_ROSE_INSTR_DEDUPE_AND_REPORT, + &&LABEL_ROSE_INSTR_FINAL_REPORT, + &&LABEL_ROSE_INSTR_CHECK_EXHAUSTED, //!< Check if an ekey has already been set. + &&LABEL_ROSE_INSTR_CHECK_MIN_LENGTH, //!< Check (EOM - SOM) against min length. + &&LABEL_ROSE_INSTR_SET_STATE, //!< Switch a state index on. + &&LABEL_ROSE_INSTR_SET_GROUPS, //!< Set some literal group bits. + &&LABEL_ROSE_INSTR_SQUASH_GROUPS, //!< Conditionally turn off some groups. + &&LABEL_ROSE_INSTR_CHECK_STATE, //!< Test a single bit in the state multibit. + &&LABEL_ROSE_INSTR_SPARSE_ITER_BEGIN, //!< Begin running a sparse iter over states. + &&LABEL_ROSE_INSTR_SPARSE_ITER_NEXT, //!< Continue running sparse iter over states. + &&LABEL_ROSE_INSTR_SPARSE_ITER_ANY, //!< Test for any bit in the sparse iterator. + &&LABEL_ROSE_INSTR_ENGINES_EOD, + &&LABEL_ROSE_INSTR_SUFFIXES_EOD, + &&LABEL_ROSE_INSTR_MATCHER_EOD, + &&LABEL_ROSE_INSTR_CHECK_LONG_LIT, + &&LABEL_ROSE_INSTR_CHECK_LONG_LIT_NOCASE, + &&LABEL_ROSE_INSTR_CHECK_MED_LIT, + &&LABEL_ROSE_INSTR_CHECK_MED_LIT_NOCASE, + &&LABEL_ROSE_INSTR_CLEAR_WORK_DONE, + &&LABEL_ROSE_INSTR_MULTIPATH_LOOKAROUND, + &&LABEL_ROSE_INSTR_CHECK_MULTIPATH_SHUFTI_16x8, + &&LABEL_ROSE_INSTR_CHECK_MULTIPATH_SHUFTI_32x8, + &&LABEL_ROSE_INSTR_CHECK_MULTIPATH_SHUFTI_32x16, + &&LABEL_ROSE_INSTR_CHECK_MULTIPATH_SHUFTI_64, + &&LABEL_ROSE_INSTR_INCLUDED_JUMP, + &&LABEL_ROSE_INSTR_SET_LOGICAL, + &&LABEL_ROSE_INSTR_SET_COMBINATION, + &&LABEL_ROSE_INSTR_FLUSH_COMBINATION, + &&LABEL_ROSE_INSTR_SET_EXHAUST, + &&LABEL_ROSE_INSTR_LAST_FLUSH_COMBINATION +#ifdef HAVE_AVX512 + , + &&LABEL_ROSE_INSTR_CHECK_SHUFTI_64x8, //!< Check 64-byte data by 8-bucket shufti. + &&LABEL_ROSE_INSTR_CHECK_SHUFTI_64x16, //!< Check 64-byte data by 16-bucket shufti. 
+ &&LABEL_ROSE_INSTR_CHECK_MASK_64 //!< 64-bytes and/cmp/neg mask check. +#endif + }; +#endif + + for (;;) { + assert(ISALIGNED_N(pc, ROSE_INSTR_MIN_ALIGN)); + assert(pc >= pc_base); + assert((size_t)(pc - pc_base) < t->size); + const u8 code = *(const u8 *)pc; + assert(code <= LAST_ROSE_INSTRUCTION); + + switch ((enum RoseInstructionCode)code) { + PROGRAM_CASE(END) { + DEBUG_PRINTF("finished\n"); + return HWLM_CONTINUE_MATCHING; + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(ANCHORED_DELAY) { + if (in_anchored && end > t->floatingMinLiteralMatchOffset) { + DEBUG_PRINTF("delay until playback\n"); + tctxt->groups |= ri->groups; + work_done = 1; + recordAnchoredLiteralMatch(t, scratch, ri->anch_id, end); + + assert(ri->done_jump); // must progress + pc += ri->done_jump; + PROGRAM_NEXT_INSTRUCTION_JUMP + } + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CHECK_LIT_EARLY) { + if (end < ri->min_offset) { + DEBUG_PRINTF("halt: before min_offset=%u\n", + ri->min_offset); + assert(ri->fail_jump); // must progress + pc += ri->fail_jump; + PROGRAM_NEXT_INSTRUCTION_JUMP + } + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CHECK_GROUPS) { + DEBUG_PRINTF("groups=0x%llx, checking instr groups=0x%llx\n", + tctxt->groups, ri->groups); + if (!(ri->groups & tctxt->groups)) { + DEBUG_PRINTF("halt: no groups are set\n"); + return HWLM_CONTINUE_MATCHING; + } + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CHECK_ONLY_EOD) { + struct core_info *ci = &scratch->core_info; + if (end != ci->buf_offset + ci->len) { + DEBUG_PRINTF("should only match at end of data\n"); + assert(ri->fail_jump); // must progress + pc += ri->fail_jump; + PROGRAM_NEXT_INSTRUCTION_JUMP + } + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CHECK_BOUNDS) { + if (!roseCheckBounds(end, ri->min_bound, ri->max_bound)) { + DEBUG_PRINTF("failed bounds check\n"); + assert(ri->fail_jump); // must progress + pc += ri->fail_jump; + PROGRAM_NEXT_INSTRUCTION_JUMP + } + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CHECK_NOT_HANDLED) { + struct fatbit *handled = scratch->handled_roles; + if (fatbit_set(handled, t->handledKeyCount, ri->key)) { + DEBUG_PRINTF("key %u already set\n", ri->key); + assert(ri->fail_jump); // must progress + pc += ri->fail_jump; + PROGRAM_NEXT_INSTRUCTION_JUMP + } + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CHECK_SINGLE_LOOKAROUND) { + if (!roseCheckSingleLookaround(t, scratch, ri->offset, + ri->reach_index, end)) { + DEBUG_PRINTF("failed lookaround check\n"); + assert(ri->fail_jump); // must progress + pc += ri->fail_jump; + PROGRAM_NEXT_INSTRUCTION_JUMP + } + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CHECK_LOOKAROUND) { + if (!roseCheckLookaround(t, scratch, ri->look_index, + ri->reach_index, ri->count, end)) { + DEBUG_PRINTF("failed lookaround check\n"); + assert(ri->fail_jump); // must progress + pc += ri->fail_jump; + PROGRAM_NEXT_INSTRUCTION_JUMP + } + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CHECK_MASK) { + struct core_info *ci = &scratch->core_info; + if (!roseCheckMask(ci, ri->and_mask, ri->cmp_mask, + ri->neg_mask, ri->offset, end)) { + DEBUG_PRINTF("failed mask check\n"); + assert(ri->fail_jump); // must progress + pc += ri->fail_jump; + PROGRAM_NEXT_INSTRUCTION_JUMP + } + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CHECK_MASK_32) { + struct core_info *ci = &scratch->core_info; + if (!roseCheckMask32(ci, ri->and_mask, ri->cmp_mask, + ri->neg_mask, ri->offset, end)) { + assert(ri->fail_jump); + pc += ri->fail_jump; + PROGRAM_NEXT_INSTRUCTION_JUMP + } + } + PROGRAM_NEXT_INSTRUCTION + + 
PROGRAM_CASE(CHECK_BYTE) { + const struct core_info *ci = &scratch->core_info; + if (!roseCheckByte(ci, ri->and_mask, ri->cmp_mask, + ri->negation, ri->offset, end)) { + DEBUG_PRINTF("failed byte check\n"); + assert(ri->fail_jump); // must progress + pc += ri->fail_jump; + PROGRAM_NEXT_INSTRUCTION_JUMP + } + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CHECK_SHUFTI_16x8) { + const struct core_info *ci = &scratch->core_info; + if (!roseCheckShufti16x8(ci, ri->nib_mask, + ri->bucket_select_mask, + ri->neg_mask, ri->offset, end)) { + assert(ri->fail_jump); + pc += ri-> fail_jump; + PROGRAM_NEXT_INSTRUCTION_JUMP + } + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CHECK_SHUFTI_32x8) { + const struct core_info *ci = &scratch->core_info; + if (!roseCheckShufti32x8(ci, ri->hi_mask, ri->lo_mask, + ri->bucket_select_mask, + ri->neg_mask, ri->offset, end)) { + assert(ri->fail_jump); + pc += ri-> fail_jump; + PROGRAM_NEXT_INSTRUCTION_JUMP + } + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CHECK_SHUFTI_16x16) { + const struct core_info *ci = &scratch->core_info; + if (!roseCheckShufti16x16(ci, ri->hi_mask, ri->lo_mask, + ri->bucket_select_mask, + ri->neg_mask, ri->offset, end)) { + assert(ri->fail_jump); + pc += ri-> fail_jump; + PROGRAM_NEXT_INSTRUCTION_JUMP + } + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CHECK_SHUFTI_32x16) { + const struct core_info *ci = &scratch->core_info; + if (!roseCheckShufti32x16(ci, ri->hi_mask, ri->lo_mask, + ri->bucket_select_mask_hi, + ri->bucket_select_mask_lo, + ri->neg_mask, ri->offset, end)) { + assert(ri->fail_jump); + pc += ri-> fail_jump; + PROGRAM_NEXT_INSTRUCTION_JUMP + } + } + PROGRAM_NEXT_INSTRUCTION + +#ifdef HAVE_AVX512 + PROGRAM_CASE(CHECK_MASK_64) { + struct core_info *ci = &scratch->core_info; + if (!roseCheckMask64(ci, ri->and_mask, ri->cmp_mask, + ri->neg_mask, ri->offset, end)) { + assert(ri->fail_jump); + pc += ri->fail_jump; + PROGRAM_NEXT_INSTRUCTION_JUMP + } + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CHECK_SHUFTI_64x8) { + const struct core_info *ci = &scratch->core_info; + if (!roseCheckShufti64x8(ci, ri->hi_mask, ri->lo_mask, + ri->bucket_select_mask, + ri->neg_mask, ri->offset, end)) { + assert(ri->fail_jump); + pc += ri->fail_jump; + PROGRAM_NEXT_INSTRUCTION_JUMP; + } + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CHECK_SHUFTI_64x16) { + const struct core_info *ci = &scratch->core_info; + if (!roseCheckShufti64x16(ci, ri->hi_mask_1, ri->hi_mask_2, + ri->lo_mask_1, ri->lo_mask_2, + ri->bucket_select_mask_hi, + ri->bucket_select_mask_lo, + ri->neg_mask, ri->offset, end)) { + assert(ri->fail_jump); + pc += ri->fail_jump; + PROGRAM_NEXT_INSTRUCTION_JUMP; + } + } + PROGRAM_NEXT_INSTRUCTION +#endif + + PROGRAM_CASE(CHECK_INFIX) { + if (!roseTestInfix(t, scratch, ri->queue, ri->lag, ri->report, + end)) { + DEBUG_PRINTF("failed infix check\n"); + assert(ri->fail_jump); // must progress + pc += ri->fail_jump; + PROGRAM_NEXT_INSTRUCTION_JUMP + } + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CHECK_PREFIX) { + if (!roseTestPrefix(t, scratch, ri->queue, ri->lag, ri->report, + end)) { + DEBUG_PRINTF("failed prefix check\n"); + assert(ri->fail_jump); // must progress + pc += ri->fail_jump; + PROGRAM_NEXT_INSTRUCTION_JUMP + } + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(PUSH_DELAYED) { + rosePushDelayedMatch(t, scratch, ri->delay, ri->index, end); + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(DUMMY_NOP) { + assert(0); + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CATCH_UP) { + if (roseCatchUpTo(t, scratch, end) == HWLM_TERMINATE_MATCHING) { 
+ return HWLM_TERMINATE_MATCHING; + } + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CATCH_UP_MPV) { + if (from_mpv || skip_mpv_catchup) { + DEBUG_PRINTF("skipping mpv catchup\n"); + } else if (roseCatchUpMPV(t, + end - scratch->core_info.buf_offset, + scratch) == HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(SOM_ADJUST) { + assert(ri->distance <= end); + som = end - ri->distance; + DEBUG_PRINTF("som is (end - %u) = %llu\n", ri->distance, som); + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(SOM_LEFTFIX) { + som = roseGetHaigSom(t, scratch, ri->queue, ri->lag); + DEBUG_PRINTF("som from leftfix is %llu\n", som); + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(SOM_FROM_REPORT) { + som = handleSomExternal(scratch, &ri->som, end); + DEBUG_PRINTF("som from report %u is %llu\n", ri->som.onmatch, + som); + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(SOM_ZERO) { + DEBUG_PRINTF("setting SOM to zero\n"); + som = 0; + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(TRIGGER_INFIX) { + roseTriggerInfix(t, scratch, som, end, ri->queue, ri->event, + ri->cancel); + work_done = 1; + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(TRIGGER_SUFFIX) { + if (roseTriggerSuffix(t, scratch, ri->queue, ri->event, som, + end) == HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + work_done = 1; + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(DEDUPE) { + updateSeqPoint(tctxt, end, from_mpv); + const char do_som = t->hasSom; // TODO: constant propagate + const char is_external_report = 1; + enum DedupeResult rv = + dedupeCatchup(t, scratch, end, som, end + ri->offset_adjust, + ri->dkey, ri->offset_adjust, + is_external_report, ri->quash_som, do_som); + switch (rv) { + case DEDUPE_HALT: + return HWLM_TERMINATE_MATCHING; + case DEDUPE_SKIP: + assert(ri->fail_jump); // must progress + pc += ri->fail_jump; + PROGRAM_NEXT_INSTRUCTION_JUMP + case DEDUPE_CONTINUE: + break; + } + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(DEDUPE_SOM) { + updateSeqPoint(tctxt, end, from_mpv); + const char is_external_report = 0; + const char do_som = 1; + enum DedupeResult rv = + dedupeCatchup(t, scratch, end, som, end + ri->offset_adjust, + ri->dkey, ri->offset_adjust, + is_external_report, ri->quash_som, do_som); + switch (rv) { + case DEDUPE_HALT: + return HWLM_TERMINATE_MATCHING; + case DEDUPE_SKIP: + assert(ri->fail_jump); // must progress + pc += ri->fail_jump; + PROGRAM_NEXT_INSTRUCTION_JUMP + case DEDUPE_CONTINUE: + break; + } + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(REPORT_CHAIN) { + // Note: sequence points updated inside this function. 
+ if (roseCatchUpAndHandleChainMatch( + t, scratch, ri->event, ri->top_squash_distance, end, + in_catchup) == HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + work_done = 1; + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(REPORT_SOM_INT) { + updateSeqPoint(tctxt, end, from_mpv); + roseHandleSom(scratch, &ri->som, end); + work_done = 1; + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(REPORT_SOM_AWARE) { + updateSeqPoint(tctxt, end, from_mpv); + roseHandleSomSom(scratch, &ri->som, som, end); + work_done = 1; + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(REPORT) { + updateSeqPoint(tctxt, end, from_mpv); + if (roseReport(t, scratch, end, ri->onmatch, ri->offset_adjust, + INVALID_EKEY) == HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + work_done = 1; + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(REPORT_EXHAUST) { + updateSeqPoint(tctxt, end, from_mpv); + if (roseReport(t, scratch, end, ri->onmatch, ri->offset_adjust, + ri->ekey) == HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + work_done = 1; + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(REPORT_SOM) { + updateSeqPoint(tctxt, end, from_mpv); + if (roseReportSom(t, scratch, som, end, ri->onmatch, + ri->offset_adjust, + INVALID_EKEY) == HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + work_done = 1; + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(REPORT_SOM_EXHAUST) { + updateSeqPoint(tctxt, end, from_mpv); + if (roseReportSom(t, scratch, som, end, ri->onmatch, + ri->offset_adjust, + ri->ekey) == HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + work_done = 1; + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(DEDUPE_AND_REPORT) { + updateSeqPoint(tctxt, end, from_mpv); + const char do_som = t->hasSom; // TODO: constant propagate + const char is_external_report = 1; + enum DedupeResult rv = + dedupeCatchup(t, scratch, end, som, end + ri->offset_adjust, + ri->dkey, ri->offset_adjust, + is_external_report, ri->quash_som, do_som); + switch (rv) { + case DEDUPE_HALT: + return HWLM_TERMINATE_MATCHING; + case DEDUPE_SKIP: + assert(ri->fail_jump); // must progress + pc += ri->fail_jump; + PROGRAM_NEXT_INSTRUCTION_JUMP + case DEDUPE_CONTINUE: + break; + } + + const u32 ekey = INVALID_EKEY; + if (roseReport(t, scratch, end, ri->onmatch, ri->offset_adjust, + ekey) == HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + work_done = 1; + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(FINAL_REPORT) { + updateSeqPoint(tctxt, end, from_mpv); + if (roseReport(t, scratch, end, ri->onmatch, ri->offset_adjust, + INVALID_EKEY) == HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + /* One-shot specialisation: this instruction always terminates + * execution of the program. 
*/ + return HWLM_CONTINUE_MATCHING; + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CHECK_EXHAUSTED) { + DEBUG_PRINTF("check ekey %u\n", ri->ekey); + assert(ri->ekey != INVALID_EKEY); + assert(ri->ekey < t->ekeyCount); + const char *evec = scratch->core_info.exhaustionVector; + if (isExhausted(t, evec, ri->ekey)) { + DEBUG_PRINTF("ekey %u already set, match is exhausted\n", + ri->ekey); + assert(ri->fail_jump); // must progress + pc += ri->fail_jump; + PROGRAM_NEXT_INSTRUCTION_JUMP + } + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CHECK_MIN_LENGTH) { + DEBUG_PRINTF("check min length %llu (adj %d)\n", ri->min_length, + ri->end_adj); + assert(ri->min_length > 0); + assert(ri->end_adj == 0 || ri->end_adj == -1); + assert(som == HS_OFFSET_PAST_HORIZON || som <= end); + if (som != HS_OFFSET_PAST_HORIZON && + ((end + ri->end_adj) - som < ri->min_length)) { + DEBUG_PRINTF("failed check, match len %llu\n", + (u64a)((end + ri->end_adj) - som)); + assert(ri->fail_jump); // must progress + pc += ri->fail_jump; + PROGRAM_NEXT_INSTRUCTION_JUMP + } + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(SET_STATE) { + DEBUG_PRINTF("set state index %u\n", ri->index); + mmbit_set(getRoleState(scratch->core_info.state), + t->rolesWithStateCount, ri->index); + work_done = 1; + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(SET_GROUPS) { + tctxt->groups |= ri->groups; + DEBUG_PRINTF("set groups 0x%llx -> 0x%llx\n", ri->groups, + tctxt->groups); + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(SQUASH_GROUPS) { + assert(popcount64(ri->groups) == 63); // Squash only one group. + if (work_done) { + tctxt->groups &= ri->groups; + DEBUG_PRINTF("squash groups 0x%llx -> 0x%llx\n", ri->groups, + tctxt->groups); + } + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CHECK_STATE) { + DEBUG_PRINTF("check state %u\n", ri->index); + const u8 *roles = getRoleState(scratch->core_info.state); + if (!mmbit_isset(roles, t->rolesWithStateCount, ri->index)) { + DEBUG_PRINTF("state not on\n"); + assert(ri->fail_jump); // must progress + pc += ri->fail_jump; + PROGRAM_NEXT_INSTRUCTION_JUMP + } + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(SPARSE_ITER_BEGIN) { + DEBUG_PRINTF("iter_offset=%u\n", ri->iter_offset); + const struct mmbit_sparse_iter *it = + getByOffset(t, ri->iter_offset); + assert(ISALIGNED(it)); + + const u8 *roles = getRoleState(scratch->core_info.state); + + u32 idx = 0; + u32 i = mmbit_sparse_iter_begin(roles, t->rolesWithStateCount, + &idx, it, si_state); + if (i == MMB_INVALID) { + DEBUG_PRINTF("no states in sparse iter are on\n"); + assert(ri->fail_jump); // must progress + pc += ri->fail_jump; + PROGRAM_NEXT_INSTRUCTION_JUMP + } + + fatbit_clear(scratch->handled_roles); + + const u32 *jumps = getByOffset(t, ri->jump_table); + DEBUG_PRINTF("state %u (idx=%u) is on, jump to %u\n", i, idx, + jumps[idx]); + pc = pc_base + jumps[idx]; + PROGRAM_NEXT_INSTRUCTION_JUMP + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(SPARSE_ITER_NEXT) { + DEBUG_PRINTF("iter_offset=%u, state=%u\n", ri->iter_offset, + ri->state); + const struct mmbit_sparse_iter *it = + getByOffset(t, ri->iter_offset); + assert(ISALIGNED(it)); + + const u8 *roles = getRoleState(scratch->core_info.state); + + u32 idx = 0; + u32 i = mmbit_sparse_iter_next(roles, t->rolesWithStateCount, + ri->state, &idx, it, si_state); + if (i == MMB_INVALID) { + DEBUG_PRINTF("no more states in sparse iter are on\n"); + assert(ri->fail_jump); // must progress + pc += ri->fail_jump; + PROGRAM_NEXT_INSTRUCTION_JUMP + } + + const u32 *jumps = getByOffset(t, ri->jump_table); + 
DEBUG_PRINTF("state %u (idx=%u) is on, jump to %u\n", i, idx, + jumps[idx]); + pc = pc_base + jumps[idx]; + PROGRAM_NEXT_INSTRUCTION_JUMP + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(SPARSE_ITER_ANY) { + DEBUG_PRINTF("iter_offset=%u\n", ri->iter_offset); + const struct mmbit_sparse_iter *it = + getByOffset(t, ri->iter_offset); + assert(ISALIGNED(it)); + + const u8 *roles = getRoleState(scratch->core_info.state); + + u32 idx = 0; + u32 i = mmbit_sparse_iter_begin(roles, t->rolesWithStateCount, + &idx, it, si_state); + if (i == MMB_INVALID) { + DEBUG_PRINTF("no states in sparse iter are on\n"); + assert(ri->fail_jump); // must progress + pc += ri->fail_jump; + PROGRAM_NEXT_INSTRUCTION_JUMP + } + DEBUG_PRINTF("state %u (idx=%u) is on\n", i, idx); + fatbit_clear(scratch->handled_roles); + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(ENGINES_EOD) { + if (roseEnginesEod(t, scratch, end, ri->iter_offset) == + HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(SUFFIXES_EOD) { + if (roseSuffixesEod(t, scratch, end) == + HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(MATCHER_EOD) { + if (roseMatcherEod(t, scratch, end) == + HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CHECK_LONG_LIT) { + const char nocase = 0; + if (!roseCheckLongLiteral(t, scratch, end, ri->lit_offset, + ri->lit_length, nocase)) { + DEBUG_PRINTF("failed long lit check\n"); + assert(ri->fail_jump); // must progress + pc += ri->fail_jump; + PROGRAM_NEXT_INSTRUCTION_JUMP + } + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CHECK_LONG_LIT_NOCASE) { + const char nocase = 1; + if (!roseCheckLongLiteral(t, scratch, end, ri->lit_offset, + ri->lit_length, nocase)) { + DEBUG_PRINTF("failed nocase long lit check\n"); + assert(ri->fail_jump); // must progress + pc += ri->fail_jump; + PROGRAM_NEXT_INSTRUCTION_JUMP + } + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CHECK_MED_LIT) { + const char nocase = 0; + if (!roseCheckMediumLiteral(t, scratch, end, ri->lit_offset, + ri->lit_length, nocase)) { + DEBUG_PRINTF("failed lit check\n"); + assert(ri->fail_jump); // must progress + pc += ri->fail_jump; + PROGRAM_NEXT_INSTRUCTION_JUMP + } + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CHECK_MED_LIT_NOCASE) { + const char nocase = 1; + if (!roseCheckMediumLiteral(t, scratch, end, ri->lit_offset, + ri->lit_length, nocase)) { + DEBUG_PRINTF("failed long lit check\n"); + assert(ri->fail_jump); // must progress + pc += ri->fail_jump; + PROGRAM_NEXT_INSTRUCTION_JUMP + } + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CLEAR_WORK_DONE) { + DEBUG_PRINTF("clear work_done flag\n"); + work_done = 0; + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(MULTIPATH_LOOKAROUND) { + if (!roseMultipathLookaround(t, scratch, ri->look_index, + ri->reach_index, ri->count, + ri->last_start, ri->start_mask, + end)) { + DEBUG_PRINTF("failed multi-path lookaround check\n"); + assert(ri->fail_jump); // must progress + pc += ri->fail_jump; + PROGRAM_NEXT_INSTRUCTION_JUMP + } + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CHECK_MULTIPATH_SHUFTI_16x8) { + if (!roseCheckMultipathShufti16x8(scratch, ri, end)) { + DEBUG_PRINTF("failed multi-path shufti 16x8 check\n"); + assert(ri->fail_jump); // must progress + pc += ri->fail_jump; + PROGRAM_NEXT_INSTRUCTION_JUMP + } + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CHECK_MULTIPATH_SHUFTI_32x8) { + if (!roseCheckMultipathShufti32x8(scratch, 
ri, end)) { + DEBUG_PRINTF("failed multi-path shufti 32x8 check\n"); + assert(ri->fail_jump); // must progress + pc += ri->fail_jump; + PROGRAM_NEXT_INSTRUCTION_JUMP + } + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CHECK_MULTIPATH_SHUFTI_32x16) { + if (!roseCheckMultipathShufti32x16(scratch, ri, end)) { + DEBUG_PRINTF("failed multi-path shufti 32x16 check\n"); + assert(ri->fail_jump); // must progress + pc += ri->fail_jump; + PROGRAM_NEXT_INSTRUCTION_JUMP + } + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(CHECK_MULTIPATH_SHUFTI_64) { + if (!roseCheckMultipathShufti64(scratch, ri, end)) { + DEBUG_PRINTF("failed multi-path shufti 64 check\n"); + assert(ri->fail_jump); // must progress + pc += ri->fail_jump; + PROGRAM_NEXT_INSTRUCTION_JUMP + } + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(INCLUDED_JUMP) { + if (scratch->fdr_conf) { + // squash the bucket of included literal + u8 shift = scratch->fdr_conf_offset & ~7U; + u64a mask = ((~(u64a)ri->squash) << shift); + *(scratch->fdr_conf) &= mask; + + pc = getByOffset(t, ri->child_offset); + pc_base = pc; + programOffset = (const u8 *)pc_base -(const u8 *)t; + DEBUG_PRINTF("pc_base %p pc %p child_offset %u squash %u\n", + pc_base, pc, ri->child_offset, ri->squash); + work_done = 0; + PROGRAM_NEXT_INSTRUCTION_JUMP + } + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(SET_LOGICAL) { + DEBUG_PRINTF("set logical value of lkey %u, offset_adjust=%d\n", + ri->lkey, ri->offset_adjust); + assert(ri->lkey != INVALID_LKEY); + assert(ri->lkey < t->lkeyCount); + char *lvec = scratch->core_info.logicalVector; + setLogicalVal(t, lvec, ri->lkey, 1); + updateLastCombMatchOffset(tctxt, end + ri->offset_adjust); + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(SET_COMBINATION) { + DEBUG_PRINTF("set ckey %u as active\n", ri->ckey); + assert(ri->ckey != INVALID_CKEY); + assert(ri->ckey < t->ckeyCount); + char *cvec = scratch->core_info.combVector; + setCombinationActive(t, cvec, ri->ckey); + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(FLUSH_COMBINATION) { + assert(end >= tctxt->lastCombMatchOffset); + if (end > tctxt->lastCombMatchOffset) { + if (flushActiveCombinations(t, scratch) + == HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + } + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(SET_EXHAUST) { + updateSeqPoint(tctxt, end, from_mpv); + if (roseSetExhaust(t, scratch, ri->ekey) + == HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + work_done = 1; + } + PROGRAM_NEXT_INSTRUCTION + + PROGRAM_CASE(LAST_FLUSH_COMBINATION) { + assert(end >= tctxt->lastCombMatchOffset); + if (flushActiveCombinations(t, scratch) + == HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + if (checkPurelyNegatives(t, scratch, end) + == HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + } + PROGRAM_NEXT_INSTRUCTION + + default: { + assert(0); // unreachable + scratch->core_info.status |= STATUS_ERROR; + return HWLM_TERMINATE_MATCHING; + } + } + } + + assert(0); // unreachable + return HWLM_CONTINUE_MATCHING; +} + +#define L_PROGRAM_CASE(name) \ + case ROSE_INSTR_##name: { \ + DEBUG_PRINTF("l_instruction: " #name " (pc=%u)\n", \ + programOffset + (u32)(pc - pc_base)); \ + const struct ROSE_STRUCT_##name *ri = \ + (const struct ROSE_STRUCT_##name *)pc; + +#define L_PROGRAM_NEXT_INSTRUCTION \ + pc += ROUNDUP_N(sizeof(*ri), ROSE_INSTR_MIN_ALIGN); \ + break; \ + } + +#define L_PROGRAM_NEXT_INSTRUCTION_JUMP continue; + +hwlmcb_rv_t roseRunProgram_l(const struct RoseEngine *t, + struct hs_scratch *scratch, u32 programOffset, + u64a 
som, u64a end, u8 prog_flags) { + DEBUG_PRINTF("program=%u, offsets [%llu,%llu], flags=%u\n", programOffset, + som, end, prog_flags); + + assert(programOffset != ROSE_INVALID_PROG_OFFSET); + assert(programOffset >= sizeof(struct RoseEngine)); + assert(programOffset < t->size); + + const char in_catchup = prog_flags & ROSE_PROG_FLAG_IN_CATCHUP; + const char from_mpv = prog_flags & ROSE_PROG_FLAG_FROM_MPV; + + const char *pc_base = getByOffset(t, programOffset); + const char *pc = pc_base; + + // If this program has an effect, work_done will be set to one (which may + // allow the program to squash groups). + int work_done = 0; + + struct RoseContext *tctxt = &scratch->tctxt; + + assert(*(const u8 *)pc != ROSE_INSTR_END); + + for (;;) { + assert(ISALIGNED_N(pc, ROSE_INSTR_MIN_ALIGN)); + assert(pc >= pc_base); + assert((size_t)(pc - pc_base) < t->size); + const u8 code = *(const u8 *)pc; + assert(code <= LAST_ROSE_INSTRUCTION); + + switch ((enum RoseInstructionCode)code) { + L_PROGRAM_CASE(END) { + DEBUG_PRINTF("finished\n"); + return HWLM_CONTINUE_MATCHING; + } + L_PROGRAM_NEXT_INSTRUCTION + + L_PROGRAM_CASE(CHECK_GROUPS) { + DEBUG_PRINTF("groups=0x%llx, checking instr groups=0x%llx\n", + tctxt->groups, ri->groups); + if (!(ri->groups & tctxt->groups)) { + DEBUG_PRINTF("halt: no groups are set\n"); + return HWLM_CONTINUE_MATCHING; + } + } + L_PROGRAM_NEXT_INSTRUCTION + + L_PROGRAM_CASE(CHECK_MASK) { + struct core_info *ci = &scratch->core_info; + if (!roseCheckMask(ci, ri->and_mask, ri->cmp_mask, + ri->neg_mask, ri->offset, end)) { + DEBUG_PRINTF("failed mask check\n"); + assert(ri->fail_jump); // must progress + pc += ri->fail_jump; + L_PROGRAM_NEXT_INSTRUCTION_JUMP + } + } + L_PROGRAM_NEXT_INSTRUCTION + + L_PROGRAM_CASE(CHECK_MASK_32) { + struct core_info *ci = &scratch->core_info; + if (!roseCheckMask32(ci, ri->and_mask, ri->cmp_mask, + ri->neg_mask, ri->offset, end)) { + assert(ri->fail_jump); + pc += ri->fail_jump; + L_PROGRAM_NEXT_INSTRUCTION_JUMP + } + } + L_PROGRAM_NEXT_INSTRUCTION + +#ifdef HAVE_AVX512 + L_PROGRAM_CASE(CHECK_MASK_64) { + struct core_info *ci = &scratch->core_info; + if (!roseCheckMask64(ci, ri->and_mask, ri->cmp_mask, + ri->neg_mask, ri->offset, end)) { + assert(ri->fail_jump); + pc += ri->fail_jump; + L_PROGRAM_NEXT_INSTRUCTION_JUMP + } + } + L_PROGRAM_NEXT_INSTRUCTION +#endif + + L_PROGRAM_CASE(CHECK_BYTE) { + const struct core_info *ci = &scratch->core_info; + if (!roseCheckByte(ci, ri->and_mask, ri->cmp_mask, + ri->negation, ri->offset, end)) { + DEBUG_PRINTF("failed byte check\n"); + assert(ri->fail_jump); // must progress + pc += ri->fail_jump; + L_PROGRAM_NEXT_INSTRUCTION_JUMP + } + } + L_PROGRAM_NEXT_INSTRUCTION + + L_PROGRAM_CASE(PUSH_DELAYED) { + rosePushDelayedMatch(t, scratch, ri->delay, ri->index, end); + } + L_PROGRAM_NEXT_INSTRUCTION + + L_PROGRAM_CASE(CATCH_UP) { + if (roseCatchUpTo(t, scratch, end) == HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + } + L_PROGRAM_NEXT_INSTRUCTION + + L_PROGRAM_CASE(SOM_FROM_REPORT) { + som = handleSomExternal(scratch, &ri->som, end); + DEBUG_PRINTF("som from report %u is %llu\n", ri->som.onmatch, + som); + } + L_PROGRAM_NEXT_INSTRUCTION + + L_PROGRAM_CASE(DEDUPE) { + updateSeqPoint(tctxt, end, from_mpv); + const char do_som = t->hasSom; // TODO: constant propagate + const char is_external_report = 1; + enum DedupeResult rv = + dedupeCatchup(t, scratch, end, som, end + ri->offset_adjust, + ri->dkey, ri->offset_adjust, + is_external_report, ri->quash_som, do_som); + switch (rv) { + case DEDUPE_HALT: 
+ return HWLM_TERMINATE_MATCHING; + case DEDUPE_SKIP: + assert(ri->fail_jump); // must progress + pc += ri->fail_jump; + L_PROGRAM_NEXT_INSTRUCTION_JUMP + case DEDUPE_CONTINUE: + break; + } + } + L_PROGRAM_NEXT_INSTRUCTION + + L_PROGRAM_CASE(DEDUPE_SOM) { + updateSeqPoint(tctxt, end, from_mpv); + const char is_external_report = 0; + const char do_som = 1; + enum DedupeResult rv = + dedupeCatchup(t, scratch, end, som, end + ri->offset_adjust, + ri->dkey, ri->offset_adjust, + is_external_report, ri->quash_som, do_som); + switch (rv) { + case DEDUPE_HALT: + return HWLM_TERMINATE_MATCHING; + case DEDUPE_SKIP: + assert(ri->fail_jump); // must progress + pc += ri->fail_jump; + L_PROGRAM_NEXT_INSTRUCTION_JUMP + case DEDUPE_CONTINUE: + break; + } + } + L_PROGRAM_NEXT_INSTRUCTION + + L_PROGRAM_CASE(REPORT_CHAIN) { + // Note: sequence points updated inside this function. + if (roseCatchUpAndHandleChainMatch( + t, scratch, ri->event, ri->top_squash_distance, end, + in_catchup) == HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + work_done = 1; + } + L_PROGRAM_NEXT_INSTRUCTION + + L_PROGRAM_CASE(REPORT) { + updateSeqPoint(tctxt, end, from_mpv); + if (roseReport(t, scratch, end, ri->onmatch, ri->offset_adjust, + INVALID_EKEY) == HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + work_done = 1; + } + L_PROGRAM_NEXT_INSTRUCTION + + L_PROGRAM_CASE(REPORT_EXHAUST) { + updateSeqPoint(tctxt, end, from_mpv); + if (roseReport(t, scratch, end, ri->onmatch, ri->offset_adjust, + ri->ekey) == HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + work_done = 1; + } + L_PROGRAM_NEXT_INSTRUCTION + + L_PROGRAM_CASE(REPORT_SOM) { + updateSeqPoint(tctxt, end, from_mpv); + if (roseReportSom(t, scratch, som, end, ri->onmatch, + ri->offset_adjust, + INVALID_EKEY) == HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + work_done = 1; + } + L_PROGRAM_NEXT_INSTRUCTION + + L_PROGRAM_CASE(DEDUPE_AND_REPORT) { + updateSeqPoint(tctxt, end, from_mpv); + const char do_som = t->hasSom; // TODO: constant propagate + const char is_external_report = 1; + enum DedupeResult rv = + dedupeCatchup(t, scratch, end, som, end + ri->offset_adjust, + ri->dkey, ri->offset_adjust, + is_external_report, ri->quash_som, do_som); + switch (rv) { + case DEDUPE_HALT: + return HWLM_TERMINATE_MATCHING; + case DEDUPE_SKIP: + assert(ri->fail_jump); // must progress + pc += ri->fail_jump; + L_PROGRAM_NEXT_INSTRUCTION_JUMP + case DEDUPE_CONTINUE: + break; + } + + const u32 ekey = INVALID_EKEY; + if (roseReport(t, scratch, end, ri->onmatch, ri->offset_adjust, + ekey) == HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + work_done = 1; + } + L_PROGRAM_NEXT_INSTRUCTION + + L_PROGRAM_CASE(FINAL_REPORT) { + updateSeqPoint(tctxt, end, from_mpv); + if (roseReport(t, scratch, end, ri->onmatch, ri->offset_adjust, + INVALID_EKEY) == HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + /* One-shot specialisation: this instruction always terminates + * execution of the program. 
*/ + return HWLM_CONTINUE_MATCHING; + } + L_PROGRAM_NEXT_INSTRUCTION + + L_PROGRAM_CASE(CHECK_EXHAUSTED) { + DEBUG_PRINTF("check ekey %u\n", ri->ekey); + assert(ri->ekey != INVALID_EKEY); + assert(ri->ekey < t->ekeyCount); + const char *evec = scratch->core_info.exhaustionVector; + if (isExhausted(t, evec, ri->ekey)) { + DEBUG_PRINTF("ekey %u already set, match is exhausted\n", + ri->ekey); + assert(ri->fail_jump); // must progress + pc += ri->fail_jump; + L_PROGRAM_NEXT_INSTRUCTION_JUMP + } + } + L_PROGRAM_NEXT_INSTRUCTION + + L_PROGRAM_CASE(SQUASH_GROUPS) { + assert(popcount64(ri->groups) == 63); // Squash only one group. + if (work_done) { + tctxt->groups &= ri->groups; + DEBUG_PRINTF("squash groups 0x%llx -> 0x%llx\n", ri->groups, + tctxt->groups); + } + } + L_PROGRAM_NEXT_INSTRUCTION + + L_PROGRAM_CASE(CHECK_LONG_LIT) { + const char nocase = 0; + if (!roseCheckLongLiteral(t, scratch, end, ri->lit_offset, + ri->lit_length, nocase)) { + DEBUG_PRINTF("failed long lit check\n"); + assert(ri->fail_jump); // must progress + pc += ri->fail_jump; + L_PROGRAM_NEXT_INSTRUCTION_JUMP + } + } + L_PROGRAM_NEXT_INSTRUCTION + + L_PROGRAM_CASE(CHECK_LONG_LIT_NOCASE) { + const char nocase = 1; + if (!roseCheckLongLiteral(t, scratch, end, ri->lit_offset, + ri->lit_length, nocase)) { + DEBUG_PRINTF("failed nocase long lit check\n"); + assert(ri->fail_jump); // must progress + pc += ri->fail_jump; + L_PROGRAM_NEXT_INSTRUCTION_JUMP + } + } + L_PROGRAM_NEXT_INSTRUCTION + + L_PROGRAM_CASE(CHECK_MED_LIT) { + const char nocase = 0; + if (!roseCheckMediumLiteral(t, scratch, end, ri->lit_offset, + ri->lit_length, nocase)) { + DEBUG_PRINTF("failed lit check\n"); + assert(ri->fail_jump); // must progress + pc += ri->fail_jump; + L_PROGRAM_NEXT_INSTRUCTION_JUMP + } + } + L_PROGRAM_NEXT_INSTRUCTION + + L_PROGRAM_CASE(CHECK_MED_LIT_NOCASE) { + const char nocase = 1; + if (!roseCheckMediumLiteral(t, scratch, end, ri->lit_offset, + ri->lit_length, nocase)) { + DEBUG_PRINTF("failed long lit check\n"); + assert(ri->fail_jump); // must progress + pc += ri->fail_jump; + L_PROGRAM_NEXT_INSTRUCTION_JUMP + } + } + L_PROGRAM_NEXT_INSTRUCTION + + L_PROGRAM_CASE(CLEAR_WORK_DONE) { + DEBUG_PRINTF("clear work_done flag\n"); + work_done = 0; + } + L_PROGRAM_NEXT_INSTRUCTION + + L_PROGRAM_CASE(INCLUDED_JUMP) { + if (scratch->fdr_conf) { + // squash the bucket of included literal + u8 shift = scratch->fdr_conf_offset & ~7U; + u64a mask = ((~(u64a)ri->squash) << shift); + *(scratch->fdr_conf) &= mask; + + pc = getByOffset(t, ri->child_offset); + pc_base = pc; + programOffset = (const u8 *)pc_base -(const u8 *)t; + DEBUG_PRINTF("pc_base %p pc %p child_offset %u squash %u\n", + pc_base, pc, ri->child_offset, ri->squash); + work_done = 0; + L_PROGRAM_NEXT_INSTRUCTION_JUMP + } + } + L_PROGRAM_NEXT_INSTRUCTION + + L_PROGRAM_CASE(SET_LOGICAL) { + DEBUG_PRINTF("set logical value of lkey %u, offset_adjust=%d\n", + ri->lkey, ri->offset_adjust); + assert(ri->lkey != INVALID_LKEY); + assert(ri->lkey < t->lkeyCount); + char *lvec = scratch->core_info.logicalVector; + setLogicalVal(t, lvec, ri->lkey, 1); + updateLastCombMatchOffset(tctxt, end + ri->offset_adjust); + } + L_PROGRAM_NEXT_INSTRUCTION + + L_PROGRAM_CASE(SET_COMBINATION) { + DEBUG_PRINTF("set ckey %u as active\n", ri->ckey); + assert(ri->ckey != INVALID_CKEY); + assert(ri->ckey < t->ckeyCount); + char *cvec = scratch->core_info.combVector; + setCombinationActive(t, cvec, ri->ckey); + } + L_PROGRAM_NEXT_INSTRUCTION + + L_PROGRAM_CASE(FLUSH_COMBINATION) { + assert(end >= 
tctxt->lastCombMatchOffset); + if (end > tctxt->lastCombMatchOffset) { + if (flushActiveCombinations(t, scratch) + == HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + } + } + L_PROGRAM_NEXT_INSTRUCTION + + L_PROGRAM_CASE(SET_EXHAUST) { + updateSeqPoint(tctxt, end, from_mpv); + if (roseSetExhaust(t, scratch, ri->ekey) + == HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + work_done = 1; + } + L_PROGRAM_NEXT_INSTRUCTION + + L_PROGRAM_CASE(LAST_FLUSH_COMBINATION) { + assert(end >= tctxt->lastCombMatchOffset); + if (flushActiveCombinations(t, scratch) + == HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + if (checkPurelyNegatives(t, scratch, end) + == HWLM_TERMINATE_MATCHING) { + return HWLM_TERMINATE_MATCHING; + } + } + L_PROGRAM_NEXT_INSTRUCTION + + default: { + assert(0); // unreachable + scratch->core_info.status |= STATUS_ERROR; + return HWLM_TERMINATE_MATCHING; + } + } + } + + assert(0); // unreachable + return HWLM_CONTINUE_MATCHING; +} + +#undef L_PROGRAM_CASE +#undef L_PROGRAM_NEXT_INSTRUCTION +#undef L_PROGRAM_NEXT_INSTRUCTION_JUMP + +#undef PROGRAM_CASE +#undef PROGRAM_NEXT_INSTRUCTION +#undef PROGRAM_NEXT_INSTRUCTION_JUMP diff --git a/regex/rose/program_runtime.h b/regex/rose/program_runtime.h new file mode 100644 index 000000000..50bf202c6 --- /dev/null +++ b/regex/rose/program_runtime.h @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2015-2019, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** + * \file + * \brief Rose runtime: program interpreter. + */ + +#ifndef PROGRAM_RUNTIME_H +#define PROGRAM_RUNTIME_H + +#include "hwlm/hwlm.h" // for hwlmcb_rv_t +#include "rose.h" +#include "scratch.h" +#include "ue2common.h" + +/* + * Program context flags, which control the behaviour of some instructions at + * based on runtime contexts (whether the program is triggered by the anchored + * matcher, engine catchup, etc). 
+ */
+
+#define ROSE_PROG_FLAG_IN_ANCHORED 1
+#define ROSE_PROG_FLAG_IN_CATCHUP 2
+#define ROSE_PROG_FLAG_FROM_MPV 4
+#define ROSE_PROG_FLAG_SKIP_MPV_CATCHUP 8
+
+hwlmcb_rv_t roseRunProgram(const struct RoseEngine *t,
+                           struct hs_scratch *scratch, u32 programOffset,
+                           u64a som, u64a end, u8 prog_flags);
+
+hwlmcb_rv_t roseRunProgram_l(const struct RoseEngine *t,
+                             struct hs_scratch *scratch, u32 programOffset,
+                             u64a som, u64a end, u8 prog_flags);
+
+#endif // PROGRAM_RUNTIME_H
diff --git a/regex/rose/rose.h b/regex/rose/rose.h
new file mode 100644
index 000000000..409b70028
--- /dev/null
+++ b/regex/rose/rose.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2015-2019, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef ROSE_H
+#define ROSE_H
+
+#include "ue2common.h"
+
+struct RoseEngine;
+struct hs_scratch;
+
+// Initialise state space for engine use.
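A note on the program-runtime interface declared above: callers pack the
ROSE_PROG_FLAG_* bits into the u8 prog_flags argument of roseRunProgram() and
roseRunProgram_l(), and the interpreter only ever tests individual bits (see
the in_catchup/from_mpv locals earlier in this patch). Below is a minimal
standalone sketch of that pattern; the flag values mirror program_runtime.h
above, and the calling scenario is hypothetical.

#include <stdio.h>

#define ROSE_PROG_FLAG_IN_ANCHORED 1
#define ROSE_PROG_FLAG_IN_CATCHUP 2
#define ROSE_PROG_FLAG_FROM_MPV 4
#define ROSE_PROG_FLAG_SKIP_MPV_CATCHUP 8

/* Unpack prog_flags into per-run booleans, the way the interpreter does. */
static void describe_flags(unsigned char prog_flags)
{
    int in_catchup = !!(prog_flags & ROSE_PROG_FLAG_IN_CATCHUP);
    int from_mpv = !!(prog_flags & ROSE_PROG_FLAG_FROM_MPV);
    int skip_mpv_catchup = !!(prog_flags & ROSE_PROG_FLAG_SKIP_MPV_CATCHUP);

    printf("in_catchup=%d from_mpv=%d skip_mpv_catchup=%d\n",
           in_catchup, from_mpv, skip_mpv_catchup);
}

int main(void)
{
    /* Hypothetical caller: a program run from the MPV while in catchup. */
    describe_flags(ROSE_PROG_FLAG_FROM_MPV | ROSE_PROG_FLAG_IN_CATCHUP);
    return 0;
}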
+void roseInitState(const struct RoseEngine *t, char *state); + +/* assumes core_info in scratch has been init to point to data */ +void roseBlockExec(const struct RoseEngine *t, struct hs_scratch *scratch); + +/* assumes core_info in scratch has been init to point to data */ +void roseStreamExec(const struct RoseEngine *t, struct hs_scratch *scratch); + +void roseStreamEodExec(const struct RoseEngine *t, u64a offset, + struct hs_scratch *scratch); + +hwlmcb_rv_t roseCallback(size_t end, u32 id, struct hs_scratch *scratch); + +int roseReportAdaptor(u64a start, u64a end, ReportID id, void *context); + +int roseRunBoundaryProgram(const struct RoseEngine *rose, u32 program, + u64a stream_offset, struct hs_scratch *scratch); + +int roseRunFlushCombProgram(const struct RoseEngine *rose, + struct hs_scratch *scratch, u64a end); + +int roseRunLastFlushCombProgram(const struct RoseEngine *rose, + struct hs_scratch *scratch, u64a end); + +#endif // ROSE_H diff --git a/regex/rose/rose_common.h b/regex/rose/rose_common.h new file mode 100644 index 000000000..34678b8fc --- /dev/null +++ b/regex/rose/rose_common.h @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef ROSE_COMMON_H +#define ROSE_COMMON_H + +// Common defs available to build-time clients as well as runtime. + +#define ROSE_BOUND_INF (~0U) +#define MAX_MASK2_WIDTH 32 + +// Max block width to use the combined small-block matcher on, instead of +// running the floating and anchored tables. +#define ROSE_SMALL_BLOCK_LEN 32 + +/** \brief Length in bytes of a reach bitvector, used by the lookaround code. */ +#define REACH_BITVECTOR_LEN 32 + +/** \brief Length in bytes of a reach bitvector for multi-path lookaround. */ +#define MULTI_REACH_BITVECTOR_LEN 256 + +/** + * \brief The max offset from the leftmost byte to the rightmost byte in + * multi-path lookaround. + */ +#define MULTIPATH_MAX_LEN 16 + +/** \brief Value used to represent an invalid Rose program offset. 
*/ +#define ROSE_INVALID_PROG_OFFSET 0 + +#endif // ROSE_COMMON_H diff --git a/regex/rose/rose_internal.h b/regex/rose/rose_internal.h new file mode 100644 index 000000000..f84c7a080 --- /dev/null +++ b/regex/rose/rose_internal.h @@ -0,0 +1,659 @@ +/* + * Copyright (c) 2015-2019, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Rose data structures. + */ + +#ifndef ROSE_INTERNAL_H +#define ROSE_INTERNAL_H + +#include "ue2common.h" +#include "rose_common.h" +#include "util/scatter.h" + +#define ROSE_OFFSET_INVALID 0xffffffff + +// Group constants +typedef u64a rose_group; + +// Delayed literal stuff +#define DELAY_BITS 5 +#define DELAY_SLOT_COUNT (1U << DELAY_BITS) +#define MAX_DELAY (DELAY_SLOT_COUNT - 1) +#define DELAY_MASK (DELAY_SLOT_COUNT - 1) + +/* Allocation of Rose literal ids + * + * The rose literal id space is segmented: + * + * ---- 0 + * | | 'Normal' undelayed literals in either e or f tables + * | | + * | | + * | | + * ---- anchored_base_id + * | | literals from the a table + * | | + * ---- delay_base_id + * | | Delayed version of normal literals + * | | + * ---- literalCount + */ + +/* Rose Literal Sources + * + * Rose currently gets events (mainly roseProcessMatch calls) from a number of + * sources: + * 1) The floating table + * 2) The anchored table + * 3) Delayed literals + * 4) Suffix NFAs + * 5) Literal masks + * 5) End anchored table + * 6) Prefix / Infix nfas + * + * Care is required to ensure that events appear to come into Rose in order + * (or sufficiently ordered for Rose to cope). Generally the progress of the + * floating table is considered the canonical position in the buffer. + * + * Anchored table: + * The anchored table is run before the floating table as nothing in it can + * depend on a floating literal. Order is achieved by two measures: + * a) user matches^1 are logged and held until the floating matcher passes that + * point; + * b) any floating role with an anchored predecessor has a history relationship + * to enforce the ordering. 
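 *
 * (Illustration of the literal id segmentation sketched in the allocation
 * diagram above, using hypothetical boundaries anchored_base_id = 100,
 * delay_base_id = 150 and literalCount = 180:
 *
 *     id <  100        -> 'normal' undelayed literal (e or f table)
 *     100 <= id < 150  -> literal from the anchored (a) table
 *     150 <= id < 180  -> delayed version of a normal literal
 *
 * so, for example, id 42 is an undelayed literal, id 120 is anchored and
 * id 160 is delayed.)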
+ * + * Delayed literals: + * Delayed literal ordering is handled by delivering any pending delayed + * literals before processing any floating match. + * + * Suffix: + * Suffixes are always pure terminal roles. Prior to raising a match^2, pending + * NFA queues are run to the current point (floating or delayed literal) as + * appropriate. + * + * Literal Masks: + * These are triggered from either floating literals or delayed literals and + * inspect the data behind them. Matches are raised at the same location as the + * trigger literal so there are no ordering issues. Masks are always pure + * terminal roles. + * + * Lookaround: + * These are tests run on receipt of a role that "look around" the match, + * checking characters at nearby offsets against reachability masks. Each role + * can have a list of these lookaround offset/reach pairs, ordered in offset + * order, and any failure will prevent the role from being switched on. Offsets + * are relative to the byte after a literal match, and can be negative. + * + * Prefix / Infix: + * TODO: remember / discuss + * + * End anchored table: + * All user matches occur at the last byte. We do this last, so no problems + * (yippee) + * + * ^1 User matches which occur before any possible match from the other tables + * are not delayed. + * ^2 Queues may also be run to the current location if a queue is full and + * needs to be emptied. + * ^3 There is no need to catch up at the end of a block scan as it contains no + * terminals. + */ + +struct RoseCountingMiracle { + char shufti; /** 1: count shufti class; 0: count a single character */ + u8 count; /** minimum number of occurrences for the counting + * miracle char to kill the leftfix. */ + u8 c; /** character to look for if not shufti */ + u8 poison; /** character not in the shufti mask */ + m128 lo; /** shufti lo mask */ + m128 hi; /** shufti hi mask */ +}; + +struct LeftNfaInfo { + u32 maxQueueLen; + u32 maxLag; // maximum of successor roles' lag + u32 lagIndex; // iff lag != 0, index into leftfixLagTable + u32 stopTable; // stop table index, or ROSE_OFFSET_INVALID + u8 transient; /**< 0 if not transient, else max width of transient prefix */ + char infix; /* TODO: make flags */ + char eager; /**< nfa should be run eagerly to first match or death */ + char eod_check; /**< nfa is used by the event eod literal */ + u32 countingMiracleOffset; /** if not 0, offset to RoseCountingMiracle. */ + rose_group squash_mask; /* & mask applied when rose nfa dies */ +}; + +struct NfaInfo { + u32 nfaOffset; + u32 stateOffset; + u32 fullStateOffset; /* offset in scratch, relative to ??? */ + u32 ekeyListOffset; /* suffix, relative to base of rose, 0 if no ekeys */ + u8 no_retrigger; /* TODO */ + u8 in_sbmatcher; /**< this outfix should not be run in small-block + * execution, as it will be handled by the sbmatcher + * HWLM table. */ + u8 eod; /* suffix is triggered by the etable --> can only produce eod + * matches */ +}; + +#define MAX_STORED_LEFTFIX_LAG 127 /* max leftfix lag that we can store in one + * whole byte (OWB) (streaming only). Other + * values in OWB are reserved for zombie + * status */ +#define OWB_ZOMBIE_ALWAYS_YES 128 /* nfa will always answer yes to any rose + * prefix checks */ + +/* offset of the status flags in the stream state. */ +#define ROSE_STATE_OFFSET_STATUS_FLAGS 0 + +/* offset of role mmbit in stream state (just after the status flag byte). */ +#define ROSE_STATE_OFFSET_ROLE_MMBIT sizeof(u8) + +/** + * \brief Rose state offsets. 
+ * + * Stores pre-calculated offsets (in bytes) to MOST of the state structures + * used by Rose, relative to the start of stream state. + * + * State not covered by this structure includes: + * + * -# the first byte, containing the status bitmask + * -# the role state multibit + */ +struct RoseStateOffsets { + /** History buffer. + * + * Max size of history is RoseEngine::historyRequired. */ + u32 history; + + /** Exhausted multibit. + * + * entry per exhaustible key (used by Highlander mode). If a bit is set, + * reports with that ekey should not be delivered to the user. */ + u32 exhausted; + + /** size in bytes of exhausted multibit */ + u32 exhausted_size; + + /** Logical multibit. + * + * entry per logical key(operand/operator) (used by Logical Combination). */ + u32 logicalVec; + + /** size in bytes of logical multibit */ + u32 logicalVec_size; + + /** Combination multibit. + * + * entry per combination key (used by Logical Combination). */ + u32 combVec; + + /** size in bytes of combination multibit */ + u32 combVec_size; + + /** Multibit for active suffix/outfix engines. */ + u32 activeLeafArray; + + /** Size of multibit for active suffix/outfix engines in bytes. */ + u32 activeLeafArray_size; + + /** Multibit for active leftfix (prefix/infix) engines. */ + u32 activeLeftArray; + + /** Size of multibit for active leftfix (prefix/infix) engines in bytes. */ + u32 activeLeftArray_size; + + /** Table of lag information (stored as one byte per engine) for active + * Rose leftfix engines. */ + u32 leftfixLagTable; + + /** State for anchored matchers (McClellan DFAs). */ + u32 anchorState; + + /** Packed Rose groups value. */ + u32 groups; + + /** Size of packed Rose groups value, in bytes. */ + u32 groups_size; + + /** State for long literal support. */ + u32 longLitState; + + /** Size of the long literal state. */ + u32 longLitState_size; + + /** Packed SOM location slots. */ + u32 somLocation; + + /** Multibit guarding SOM location slots. */ + u32 somValid; + + /** Multibit guarding SOM location slots. */ + u32 somWritable; + + /** Size of each of the somValid and somWritable multibits, in bytes. */ + u32 somMultibit_size; + + /** Begin of the region where NFA engine state is stored. + * The NFA state region extends to end. */ + u32 nfaStateBegin; + + /** Total size of Rose state, in bytes. */ + u32 end; +}; + +struct RoseBoundaryReports { + /** \brief 0 if no reports list, otherwise offset of program to run to + * deliver reports at EOD. */ + u32 reportEodOffset; + + /** \brief 0 if no reports list, otherwise offset of program to run to + * deliver reports at offset 0. */ + u32 reportZeroOffset; + + /** \brief 0 if no reports list, otherwise offset of program to run to + * deliver reports if EOD is at offset 0. Superset of other programs. */ + u32 reportZeroEodOffset; +}; + +/* NFA Queue Assignment + * + * --- 0 + * (|) chained mpv (if present) + * # + * --- outfixBeginQueue - + * | outfixes. enabled at offset 0. + * | + * # + * --- outfixEndQueue - + * | suffixes. enabled by rose roles. + * | + * # + * --- leftfixBeginQueue - + * | prefixes + * | + * # + * --- ? + * | infixes + * | + * # + */ + +#define ROSE_RUNTIME_FULL_ROSE 0 +#define ROSE_RUNTIME_PURE_LITERAL 1 +#define ROSE_RUNTIME_SINGLE_OUTFIX 2 + +/** + * \brief Runtime structure header for Rose. + * + * Runtime structure header for Rose. 
+ * In memory, we follow this with: + * -# the "engine blob" + * -# anchored 'literal' matcher table + * -# floating literal matcher table + * -# eod-anchored literal matcher table + * -# small block table + * -# array of NFA offsets, one per queue + * -# array of state offsets, one per queue (+) + * + * (+) stateOffset array note: Offsets in the array are either into the stream + * state (normal case) or into the tstate region of scratch (for transient rose + * nfas). Rose nfa info table can distinguish the cases. + */ +struct RoseEngine { + u8 pureLiteral; /* Indicator of pure literal API */ + u8 noFloatingRoots; /* only need to run the anchored table if something + * matched in the anchored table */ + u8 requiresEodCheck; /* stuff happens at eod time */ + u8 hasOutfixesInSmallBlock; /**< has at least one outfix that must run even + in small block scans. */ + u8 runtimeImpl; /**< can we just run the floating table or a single outfix? + * or do we need a full rose? */ + u8 mpvTriggeredByLeaf; /**< need to check (suf|out)fixes for mpv trigger */ + u8 canExhaust; /**< every pattern has an exhaustion key */ + u8 hasSom; /**< has at least one pattern which tracks SOM. */ + u8 somHorizon; /**< width in bytes of SOM offset storage (governed by + SOM precision) */ + u32 mode; /**< scanning mode, one of HS_MODE_{BLOCK,STREAM,VECTORED} */ + u32 historyRequired; /**< max amount of history required for streaming */ + u32 ekeyCount; /**< number of exhaustion keys */ + u32 lkeyCount; /**< number of logical keys */ + u32 lopCount; /**< number of logical ops */ + u32 ckeyCount; /**< number of combination keys */ + u32 logicalTreeOffset; /**< offset to mapping from lkey to LogicalOp */ + u32 combInfoMapOffset; /**< offset to mapping from ckey to combInfo */ + u32 dkeyCount; /**< number of dedupe keys */ + u32 dkeyLogSize; /**< size of fatbit for storing dkey log (bytes) */ + u32 invDkeyOffset; /**< offset to table mapping from dkeys to the external + * report ids */ + u32 somLocationCount; /**< number of som locations required */ + u32 somLocationFatbitSize; /**< size of SOM location fatbit (bytes) */ + u32 rolesWithStateCount; // number of roles with entries in state bitset + u32 stateSize; /* size of the state bitset + * WARNING: not the size of the rose state */ + u32 anchorStateSize; /* size of the state for the anchor dfas */ + u32 tStateSize; /* total size of the state for transient rose nfas */ + u32 scratchStateSize; /**< uncompressed state req'd for NFAs in scratch; + * used for sizing scratch only. */ + u32 smallWriteOffset; /**< offset of small-write matcher */ + u32 amatcherOffset; // offset of the anchored literal matcher (bytes) + u32 ematcherOffset; // offset of the eod-anchored literal matcher (bytes) + u32 fmatcherOffset; // offset of the floating literal matcher (bytes) + u32 drmatcherOffset; // offset of the delayed rebuild table (bytes) + u32 sbmatcherOffset; // offset of the small-block literal matcher (bytes) + u32 longLitTableOffset; // offset of the long literal table + u32 amatcherMinWidth; /**< minimum number of bytes required for a pattern + * involved with the anchored table to produce a full + * match. */ + u32 fmatcherMinWidth; /**< minimum number of bytes required for a pattern + * involved with the floating table to produce a full + * match. */ + u32 eodmatcherMinWidth; /**< minimum number of bytes required for a pattern + * involved with the eod table to produce a full + * match. 
*/ + u32 amatcherMaxBiAnchoredWidth; /**< maximum number of bytes that can still + * produce a match for a pattern involved + * with the anchored table. */ + u32 fmatcherMaxBiAnchoredWidth; /**< maximum number of bytes that can still + * produce a match for a pattern involved + * with the anchored table. */ + + /** + * \brief Offset of u32 array of program offsets for reports used by + * output-exposed engines. + */ + u32 reportProgramOffset; + + /** + * \brief Number of programs for reports used by output-exposed engines. + */ + u32 reportProgramCount; + + /** + * \brief Offset of u32 array of program offsets for delayed replay of + * literals. + */ + u32 delayProgramOffset; + + /** + * \brief Offset of u32 array of program offsets for anchored literals. + */ + u32 anchoredProgramOffset; + + u32 activeArrayCount; //number of nfas tracked in the active array + u32 activeLeftCount; //number of nfas tracked in the active rose array + u32 queueCount; /**< number of nfa queues */ + u32 activeQueueArraySize; //!< size of fatbit for active queues (bytes) + + u32 eagerIterOffset; /**< offset to sparse iter for eager prefixes or 0 if + * none */ + + /** \brief Number of keys used by CHECK_SET_HANDLED instructions in role + * programs. */ + u32 handledKeyCount; + + /** \brief Size of the handled keys fatbit in scratch (bytes). */ + u32 handledKeyFatbitSize; + + u32 leftOffset; + u32 roseCount; + + u32 eodProgramOffset; //!< EOD program, otherwise 0. + u32 flushCombProgramOffset; /**< FlushCombination program, otherwise 0 */ + u32 lastFlushCombProgramOffset; /**< LastFlushCombination program, + * otherwise 0 */ + + u32 lastByteHistoryIterOffset; // if non-zero + + /** \brief Minimum number of bytes required to match. */ + u32 minWidth; + + /** \brief Minimum number of bytes required to match, excluding boundary + * reports. */ + u32 minWidthExcludingBoundaries; + + u32 maxBiAnchoredWidth; /* ROSE_BOUND_INF if any non bianchored patterns + * present */ + u32 anchoredDistance; // region to run the anchored table over + u32 anchoredMinDistance; /* start of region to run anchored table over */ + u32 floatingDistance; /* end of region to run the floating table over + ROSE_BOUND_INF if not bounded */ + u32 floatingMinDistance; /* start of region to run floating table over */ + u32 smallBlockDistance; /* end of region to run the floating table over + ROSE_BOUND_INF if not bounded */ + u32 floatingMinLiteralMatchOffset; /* the minimum offset that we can get a + * 'valid' match from the floating + * table */ + u32 nfaInfoOffset; /* offset to the nfa info offset array */ + rose_group initialGroups; + rose_group floating_group_mask; /* groups that are used by the ftable */ + u32 size; // (bytes) + u32 delay_count; /* number of delayed literal ids. */ + u32 delay_fatbit_size; //!< size of each delay fatbit in scratch (bytes) + u32 anchored_count; /* number of anchored literal ids */ + u32 anchored_fatbit_size; //!< size of each anch fatbit in scratch (bytes) + u32 maxFloatingDelayedMatch; /* max offset that a delayed literal can + * usefully be reported */ + u32 delayRebuildLength; /* length of the history region which needs to be + * rescanned when we are doing a delayed literal + * rebuild scan. 
*/ + struct RoseStateOffsets stateOffsets; + struct RoseBoundaryReports boundary; + u32 totalNumLiterals; /* total number of literals including dr */ + u32 asize; /* size of the atable */ + u32 outfixBeginQueue; /* first outfix queue */ + u32 outfixEndQueue; /* one past the last outfix queue */ + u32 leftfixBeginQueue; /* first prefix/infix queue */ + u32 initMpvNfa; /* (allegedly chained) mpv to force on at init */ + u32 rosePrefixCount; /* number of rose prefixes */ + u32 activeLeftIterOffset; /* mmbit_sparse_iter over non-transient roses */ + u32 ematcherRegionSize; /* max region size to pass to ematcher */ + u32 somRevCount; /**< number of som reverse nfas */ + u32 somRevOffsetOffset; /**< offset to array of offsets to som rev nfas */ + u32 longLitStreamState; // size in bytes + + struct scatter_full_plan state_init; +}; + +struct ALIGN_CL_DIRECTIVE anchored_matcher_info { + u32 next_offset; /* relative to this, 0 for end */ + u32 state_offset; /* relative to anchorState */ + u32 anchoredMinDistance; /* start of region to run anchored table over */ +}; + +/** + * \brief Long literal subtable for a particular mode (caseful or nocase). + */ +struct RoseLongLitSubtable { + /** + * \brief Offset of the hash table (relative to RoseLongLitTable base). + * + * Offset is zero if no such table exists. + */ + u32 hashOffset; + + /** + * \brief Offset of the bloom filter (relative to RoseLongLitTable base). + * + * Offset is zero if no such table exists. + */ + u32 bloomOffset; + + /** \brief lg2 of the size of the hash table. */ + u8 hashBits; + + /** \brief Size of the bloom filter in bits. */ + u8 bloomBits; + + /** \brief Number of bits of packed stream state used. */ + u8 streamStateBits; +}; + +/** + * \brief Long literal table header. + */ +struct RoseLongLitTable { + /** + * \brief Total size of the whole table (including strings, bloom filters, + * hash tables). + */ + u32 size; + + /** \brief Caseful sub-table (hash table and bloom filter). */ + struct RoseLongLitSubtable caseful; + + /** \brief Caseless sub-table (hash table and bloom filter). */ + struct RoseLongLitSubtable nocase; + + /** \brief Total size of packed stream state in bytes. */ + u8 streamStateBytes; + + /** \brief Max length of literal prefixes. */ + u8 maxLen; +}; + +/** + * \brief One of these structures per hash table entry in our long literal + * table. + */ +struct RoseLongLitHashEntry { + /** + * \brief Offset of the literal string itself, relative to + * RoseLongLitTable base. Zero if this bucket is empty. + */ + u32 str_offset; + + /** \brief Length of the literal string. 
*/ + u32 str_len; +}; + +static really_inline +const struct anchored_matcher_info *getALiteralMatcher( + const struct RoseEngine *t) { + if (!t->amatcherOffset) { + return NULL; + } + + const char *lt = (const char *)t + t->amatcherOffset; + assert(ISALIGNED_CL(lt)); + return (const struct anchored_matcher_info *)lt; +} + +struct HWLM; + +static really_inline +const struct HWLM *getFLiteralMatcher(const struct RoseEngine *t) { + if (!t->fmatcherOffset) { + return NULL; + } + + const char *lt = (const char *)t + t->fmatcherOffset; + assert(ISALIGNED_CL(lt)); + return (const struct HWLM *)lt; +} + +static really_inline +const void *getSBLiteralMatcher(const struct RoseEngine *t) { + if (!t->sbmatcherOffset) { + return NULL; + } + + const char *matcher = (const char *)t + t->sbmatcherOffset; + assert(ISALIGNED_N(matcher, 8)); + return matcher; +} + +static really_inline +const struct LeftNfaInfo *getLeftTable(const struct RoseEngine *t) { + const struct LeftNfaInfo *r + = (const struct LeftNfaInfo *)((const char *)t + t->leftOffset); + assert(ISALIGNED_N(r, 4)); + return r; +} + +struct mmbit_sparse_iter; // forward decl + +static really_inline +const struct mmbit_sparse_iter *getActiveLeftIter(const struct RoseEngine *t) { + assert(t->activeLeftIterOffset); + const struct mmbit_sparse_iter *it = (const struct mmbit_sparse_iter *) + ((const char *)t + t->activeLeftIterOffset); + assert(ISALIGNED_N(it, 4)); + return it; +} + +static really_inline +const struct NfaInfo *getNfaInfoByQueue(const struct RoseEngine *t, u32 qi) { + const struct NfaInfo *infos + = (const struct NfaInfo *)((const char *)t + t->nfaInfoOffset); + assert(ISALIGNED_N(infos, sizeof(u32))); + + return &infos[qi]; +} + +static really_inline +const struct NFA *getNfaByInfo(const struct RoseEngine *t, + const struct NfaInfo *info) { + return (const struct NFA *)((const char *)t + info->nfaOffset); +} + +static really_inline +const struct NFA *getNfaByQueue(const struct RoseEngine *t, u32 qi) { + const struct NfaInfo *info = getNfaInfoByQueue(t, qi); + return getNfaByInfo(t, info); +} + +static really_inline +u32 queueToLeftIndex(const struct RoseEngine *t, u32 qi) { + assert(qi >= t->leftfixBeginQueue); + return qi - t->leftfixBeginQueue; +} + +static really_inline +const struct LeftNfaInfo *getLeftInfoByQueue(const struct RoseEngine *t, + u32 qi) { + const struct LeftNfaInfo *infos = getLeftTable(t); + return &infos[queueToLeftIndex(t, qi)]; +} + +struct SmallWriteEngine; + +static really_inline +const struct SmallWriteEngine *getSmallWrite(const struct RoseEngine *t) { + if (!t->smallWriteOffset) { + return NULL; + } + + const struct SmallWriteEngine *smwr = + (const struct SmallWriteEngine *)((const char *)t + t->smallWriteOffset); + return smwr; +} + +#endif // ROSE_INTERNAL_H diff --git a/regex/rose/rose_program.h b/regex/rose/rose_program.h new file mode 100644 index 000000000..7e21303cb --- /dev/null +++ b/regex/rose/rose_program.h @@ -0,0 +1,724 @@ +/* + * Copyright (c) 2015-2020, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Rose data structures to do with role programs. + */ + +#ifndef ROSE_ROSE_PROGRAM_H +#define ROSE_ROSE_PROGRAM_H + +#include "som/som_operation.h" +#include "rose_internal.h" +#include "ue2common.h" +#include "util/simd_types.h" + +/** \brief Minimum alignment for each instruction in memory. */ +#define ROSE_INSTR_MIN_ALIGN 8U + +/** \brief Role program instruction opcodes. */ +enum RoseInstructionCode { + ROSE_INSTR_END, //!< End of program. + ROSE_INSTR_ANCHORED_DELAY, //!< Delay until after anchored matcher. + ROSE_INSTR_CHECK_LIT_EARLY, //!< Skip matches before floating min offset. + ROSE_INSTR_CHECK_GROUPS, //!< Check that literal groups are on. + ROSE_INSTR_CHECK_ONLY_EOD, //!< Role matches only at EOD. + ROSE_INSTR_CHECK_BOUNDS, //!< Bounds on distance from offset 0. + ROSE_INSTR_CHECK_NOT_HANDLED, //!< Test & set role in "handled". + ROSE_INSTR_CHECK_SINGLE_LOOKAROUND, //!< Single lookaround check. + ROSE_INSTR_CHECK_LOOKAROUND, //!< Lookaround check. + ROSE_INSTR_CHECK_MASK, //!< 8-bytes mask check. + ROSE_INSTR_CHECK_MASK_32, //!< 32-bytes and/cmp/neg mask check. + ROSE_INSTR_CHECK_BYTE, //!< Single Byte check. + ROSE_INSTR_CHECK_SHUFTI_16x8, //!< Check 16-byte data by 8-bucket shufti. + ROSE_INSTR_CHECK_SHUFTI_32x8, //!< Check 32-byte data by 8-bucket shufti. + ROSE_INSTR_CHECK_SHUFTI_16x16, //!< Check 16-byte data by 16-bucket shufti. + ROSE_INSTR_CHECK_SHUFTI_32x16, //!< Check 32-byte data by 16-bucket shufti. + ROSE_INSTR_CHECK_INFIX, //!< Infix engine must be in accept state. + ROSE_INSTR_CHECK_PREFIX, //!< Prefix engine must be in accept state. + ROSE_INSTR_PUSH_DELAYED, //!< Push delayed literal matches. + ROSE_INSTR_DUMMY_NOP, //!< NOP. Should not exist in build programs. + ROSE_INSTR_CATCH_UP, //!< Catch up engines, anchored matches. + ROSE_INSTR_CATCH_UP_MPV, //!< Catch up the MPV. + ROSE_INSTR_SOM_ADJUST, //!< Set SOM from a distance to EOM. + ROSE_INSTR_SOM_LEFTFIX, //!< Acquire SOM from a leftfix engine. + ROSE_INSTR_SOM_FROM_REPORT, //!< Acquire SOM from a som_operation. + ROSE_INSTR_SOM_ZERO, //!< Set SOM to zero. + ROSE_INSTR_TRIGGER_INFIX, //!< Trigger an infix engine. + ROSE_INSTR_TRIGGER_SUFFIX, //!< Trigger a suffix engine. + ROSE_INSTR_DEDUPE, //!< Run deduplication for report. + ROSE_INSTR_DEDUPE_SOM, //!< Run deduplication for SOM report. + ROSE_INSTR_REPORT_CHAIN, //!< Fire a chained report (MPV). + ROSE_INSTR_REPORT_SOM_INT, //!< Manipulate SOM only. + ROSE_INSTR_REPORT_SOM_AWARE, //!< Manipulate SOM from SOM-aware source. 
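    /*
     * All of the CHECK_* opcodes above share one dispatch convention, visible
     * in roseRunProgram()/roseRunProgram_l() earlier in this patch: each
     * instruction is a fixed-size struct whose first byte is the opcode, the
     * success path advances
     *
     *     pc += ROUNDUP_N(sizeof(*ri), ROSE_INSTR_MIN_ALIGN);
     *
     * (the *_NEXT_INSTRUCTION macros), and a failed check takes the relative
     * branch stored in the instruction itself,
     *
     *     pc += ri->fail_jump;   // asserted non-zero, so the program
     *                            // always makes progress
     *
     * with both offsets measured from the start of the current instruction.
     */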
+ + /** \brief Fire a report. */ + ROSE_INSTR_REPORT, + + /** \brief Fire an exhaustible report. */ + ROSE_INSTR_REPORT_EXHAUST, + + /** \brief Fire a SOM report. */ + ROSE_INSTR_REPORT_SOM, + + /** \brief Fire an exhaustible SOM report. */ + ROSE_INSTR_REPORT_SOM_EXHAUST, + + /** \brief Super-instruction combining DEDUPE and REPORT. */ + ROSE_INSTR_DEDUPE_AND_REPORT, + + /** + * \brief Fire a report and stop program execution. This is a + * specialisation intended for short, frequently-executed programs. + */ + ROSE_INSTR_FINAL_REPORT, + + ROSE_INSTR_CHECK_EXHAUSTED, //!< Check if an ekey has already been set. + ROSE_INSTR_CHECK_MIN_LENGTH, //!< Check (EOM - SOM) against min length. + ROSE_INSTR_SET_STATE, //!< Switch a state index on. + ROSE_INSTR_SET_GROUPS, //!< Set some literal group bits. + ROSE_INSTR_SQUASH_GROUPS, //!< Conditionally turn off some groups. + ROSE_INSTR_CHECK_STATE, //!< Test a single bit in the state multibit. + ROSE_INSTR_SPARSE_ITER_BEGIN, //!< Begin running a sparse iter over states. + ROSE_INSTR_SPARSE_ITER_NEXT, //!< Continue running sparse iter over states. + ROSE_INSTR_SPARSE_ITER_ANY, //!< Test for any bit in the sparse iterator. + + /** \brief Check outfixes and suffixes for EOD and fire reports if so. */ + ROSE_INSTR_ENGINES_EOD, + + /** \brief Catch up and check active suffixes for EOD and fire reports if + * so. */ + ROSE_INSTR_SUFFIXES_EOD, + + /** \brief Run the EOD-anchored HWLM literal matcher. */ + ROSE_INSTR_MATCHER_EOD, + + /** + * \brief Confirm a case-sensitive literal at the current offset. In + * streaming mode, this makes use of the long literal table. + */ + ROSE_INSTR_CHECK_LONG_LIT, + + /** + * \brief Confirm a case-insensitive literal at the current offset. In + * streaming mode, this makes use of the long literal table. + */ + ROSE_INSTR_CHECK_LONG_LIT_NOCASE, + + /** + * \brief Confirm a case-sensitive "medium length" literal at the current + * offset. In streaming mode, this will check history if needed. + */ + ROSE_INSTR_CHECK_MED_LIT, + + /** + * \brief Confirm a case-insensitive "medium length" literal at the current + * offset. In streaming mode, this will check history if needed. + */ + ROSE_INSTR_CHECK_MED_LIT_NOCASE, + + /** + * \brief Clear the "work done" flag used by the SQUASH_GROUPS instruction. + */ + ROSE_INSTR_CLEAR_WORK_DONE, + + /** \brief Check lookaround if it has multiple paths. */ + ROSE_INSTR_MULTIPATH_LOOKAROUND, + + /** + * \brief Use shufti to check lookaround with multiple paths. The total + * length of the paths is 16 bytes at most and shufti has 8 buckets. + * All paths can be at most 16 bytes long. + */ + ROSE_INSTR_CHECK_MULTIPATH_SHUFTI_16x8, + + /** + * \brief Use shufti to check lookaround with multiple paths. The total + * length of the paths is 32 bytes at most and shufti has 8 buckets. + * All paths can be at most 16 bytes long. + */ + ROSE_INSTR_CHECK_MULTIPATH_SHUFTI_32x8, + + /** + * \brief Use shufti to check lookaround with multiple paths. The total + * length of the paths is 32 bytes at most and shufti has 16 buckets. + * All paths can be at most 16 bytes long. + */ + ROSE_INSTR_CHECK_MULTIPATH_SHUFTI_32x16, + + /** + * \brief Use shufti to check multiple paths lookaround. The total + * length of the paths is 64 bytes at most and shufti has 8 buckets. + * All paths can be at most 16 bytes long. + */ + ROSE_INSTR_CHECK_MULTIPATH_SHUFTI_64, + + /** + * \brief Jump to the program of included literal. + */ + ROSE_INSTR_INCLUDED_JUMP, + + /** + * \brief Set matching status of a sub-expression. 
+ */ + ROSE_INSTR_SET_LOGICAL, + + /** + * \brief Set combination status pending checking. + */ + ROSE_INSTR_SET_COMBINATION, + + /** + * \brief Check if compliant with any logical constraints. + */ + ROSE_INSTR_FLUSH_COMBINATION, + + /** \brief Mark as exhausted instead of report while quiet. */ + ROSE_INSTR_SET_EXHAUST, + + /** + * \brief Calculate any combination's logical value if none of its + * sub-expression matches until EOD, then check if compliant with any + * logical constraints. + */ + ROSE_INSTR_LAST_FLUSH_COMBINATION, + + ROSE_INSTR_CHECK_SHUFTI_64x8, //!< Check 64-byte data by 8-bucket shufti. + ROSE_INSTR_CHECK_SHUFTI_64x16, //!< Check 64-byte data by 16-bucket shufti. + ROSE_INSTR_CHECK_MASK_64, //!< 64-bytes and/cmp/neg mask check. + + LAST_ROSE_INSTRUCTION = ROSE_INSTR_CHECK_MASK_64 //!< Sentinel. +}; + +struct ROSE_STRUCT_END { + u8 code; //!< From enum RoseInstructionCode. +}; + +struct ROSE_STRUCT_ANCHORED_DELAY { + u8 code; //!< From enum RoseInstructionCode. + rose_group groups; //!< Bitmask. + u32 anch_id; //!< Program to restart after the delay. + u32 done_jump; //!< Jump forward this many bytes if we have to delay. +}; + +struct ROSE_STRUCT_CHECK_LIT_EARLY { + u8 code; //!< From enum RoseInstructionCode. + u32 min_offset; //!< Minimum offset for this literal. + u32 fail_jump; //!< Jump forward this many bytes on failure. +}; + +/** Note: check failure will halt program. */ +struct ROSE_STRUCT_CHECK_GROUPS { + u8 code; //!< From enum RoseInstructionCode. + rose_group groups; //!< Bitmask. +}; + +struct ROSE_STRUCT_CHECK_ONLY_EOD { + u8 code; //!< From enum RoseInstructionCode. + u32 fail_jump; //!< Jump forward this many bytes on failure. +}; + +struct ROSE_STRUCT_CHECK_BOUNDS { + u8 code; //!< From enum RoseInstructionCode. + u64a min_bound; //!< Min distance from zero. + u64a max_bound; //!< Max distance from zero. + u32 fail_jump; //!< Jump forward this many bytes on failure. +}; + +struct ROSE_STRUCT_CHECK_NOT_HANDLED { + u8 code; //!< From enum RoseInstructionCode. + u32 key; //!< Key in the "handled_roles" fatbit in scratch. + u32 fail_jump; //!< Jump forward this many bytes if we have seen key before. +}; + +struct ROSE_STRUCT_CHECK_SINGLE_LOOKAROUND { + u8 code; //!< From enum RoseInstructionCode. + s8 offset; //!< The offset of the byte to examine. + u32 reach_index; //!< Index for lookaround reach bitvectors. + u32 fail_jump; //!< Jump forward this many bytes on failure. +}; + +struct ROSE_STRUCT_CHECK_LOOKAROUND { + u8 code; //!< From enum RoseInstructionCode. + u32 look_index; //!< Offset in bytecode of lookaround offset list. + u32 reach_index; //!< Offset in bytecode of lookaround reach bitvectors. + u32 count; //!< The count of lookaround entries in one instruction. + u32 fail_jump; //!< Jump forward this many bytes on failure. +}; + +struct ROSE_STRUCT_CHECK_MASK { + u8 code; //!< From enum roseInstructionCode. + u64a and_mask; //!< 8-byte and mask. + u64a cmp_mask; //!< 8-byte cmp mask. + u64a neg_mask; //!< 8-byte negation mask. + s32 offset; //!< Relative offset of the first byte. + u32 fail_jump; //!< Jump forward this many bytes on failure. +}; + +struct ROSE_STRUCT_CHECK_MASK_32 { + u8 code; //!< From enum RoseInstructionCode. + u8 and_mask[32]; //!< 32-byte and mask. + u8 cmp_mask[32]; //!< 32-byte cmp mask. + u32 neg_mask; //!< negation mask with 32 bits. + s32 offset; //!< Relative offset of the first byte. + u32 fail_jump; //!< Jump forward this many bytes on failure. 
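+
+    /*
+     * The CHECK_MASK family is evaluated byte-wise; a sketch of the intended
+     * semantics (the runtime vectorises this):
+     *
+     *     match[i] = ((data[offset + i] & and_mask[i]) == cmp_mask[i]);
+     *     if (neg_mask has bit i set) match[i] = !match[i];
+     *     // if any byte fails, jump forward by fail_jump
+     *
+     * i.e. and_mask selects the interesting bits of each byte, cmp_mask
+     * holds the expected value, and neg_mask inverts the comparison for
+     * individual byte positions.
+     */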
+};
+
+struct ROSE_STRUCT_CHECK_MASK_64 {
+    u8 code; //!< From enum RoseInstructionCode.
+    u8 and_mask[64]; //!< 64-byte and mask.
+    u8 cmp_mask[64]; //!< 64-byte cmp mask.
+    u64a neg_mask; //!< negation mask with 64 bits.
+    s32 offset; //!< Relative offset of the first byte.
+    u32 fail_jump; //!< Jump forward this many bytes on failure.
+};
+
+struct ROSE_STRUCT_CHECK_BYTE {
+    u8 code; //!< From enum RoseInstructionCode.
+    u8 and_mask; //!< 8-bit and mask.
+    u8 cmp_mask; //!< 8-bit cmp mask.
+    u8 negation; //!< Flag about negation.
+    s32 offset; //!< The relative offset.
+    u32 fail_jump; //!< Jump forward this many bytes on failure.
+};
+
+// Since m128 and m256 could be misaligned in the bytecode,
+// we'll use u8[16] and u8[32] instead in all rose_check_shufti structures.
+struct ROSE_STRUCT_CHECK_SHUFTI_16x8 {
+    u8 code; //!< From enum RoseInstructionCode.
+    u8 nib_mask[32]; //!< High 16 and low 16 bits nibble mask in shufti.
+    u8 bucket_select_mask[16]; //!< Mask for bucket assigning.
+    u32 neg_mask; //!< Negation mask in low 16 bits.
+    s32 offset; //!< Relative offset of the first byte.
+    u32 fail_jump; //!< Jump forward this many bytes on failure.
+};
+
+struct ROSE_STRUCT_CHECK_SHUFTI_32x8 {
+    u8 code; //!< From enum RoseInstructionCode.
+    u8 hi_mask[16]; //!< High nibble mask in shufti.
+    u8 lo_mask[16]; //!< Low nibble mask in shufti.
+    u8 bucket_select_mask[32]; //!< Mask for bucket assigning.
+    u32 neg_mask; //!< 32 bits negation mask.
+    s32 offset; //!< Relative offset of the first byte.
+    u32 fail_jump; //!< Jump forward this many bytes on failure.
+};
+
+struct ROSE_STRUCT_CHECK_SHUFTI_16x16 {
+    u8 code; //!< From enum RoseInstructionCode.
+    u8 hi_mask[32]; //!< High nibble mask in shufti.
+    u8 lo_mask[32]; //!< Low nibble mask in shufti.
+    u8 bucket_select_mask[32]; //!< Mask for bucket assigning.
+    u32 neg_mask; //!< Negation mask in low 16 bits.
+    s32 offset; //!< Relative offset of the first byte.
+    u32 fail_jump; //!< Jump forward this many bytes on failure.
+};
+
+struct ROSE_STRUCT_CHECK_SHUFTI_32x16 {
+    u8 code; //!< From enum RoseInstructionCode.
+    u8 hi_mask[32]; //!< High nibble mask in shufti.
+    u8 lo_mask[32]; //!< Low nibble mask in shufti.
+    u8 bucket_select_mask_hi[32]; //!< Bucket mask for high 8 buckets.
+    u8 bucket_select_mask_lo[32]; //!< Bucket mask for low 8 buckets.
+    u32 neg_mask; //!< 32 bits negation mask.
+    s32 offset; //!< Relative offset of the first byte.
+    u32 fail_jump; //!< Jump forward this many bytes on failure.
+};
+
+struct ROSE_STRUCT_CHECK_SHUFTI_64x8 {
+    u8 code; //!< From enum RoseInstructionCode.
+    u8 hi_mask[64]; //!< High nibble mask in shufti.
+    u8 lo_mask[64]; //!< Low nibble mask in shufti.
+    u8 bucket_select_mask[64]; //!< Mask for bucket assigning.
+    u64a neg_mask; //!< 64 bits negation mask.
+    s32 offset; //!< Relative offset of the first byte.
+    u32 fail_jump; //!< Jump forward this many bytes on failure.
+};
+
+struct ROSE_STRUCT_CHECK_SHUFTI_64x16 {
+    u8 code; //!< From enum RoseInstructionCode.
+    u8 hi_mask_1[64]; //!< 4 copies of 0-15 High nibble mask.
+    u8 hi_mask_2[64]; //!< 4 copies of 16-32 High nibble mask.
+    u8 lo_mask_1[64]; //!< 4 copies of 0-15 Low nibble mask.
+    u8 lo_mask_2[64]; //!< 4 copies of 16-32 Low nibble mask.
+    u8 bucket_select_mask_hi[64]; //!< Bucket mask for high 8 buckets.
+    u8 bucket_select_mask_lo[64]; //!< Bucket mask for low 8 buckets.
+    u64a neg_mask; //!< 64 bits negation mask.
+    s32 offset; //!< Relative offset of the first byte.
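+
+    /*
+     * The hi/lo masks in the CHECK_SHUFTI_* records are 16-entry nibble
+     * lookup tables. Roughly, for each scanned byte b the engine computes
+     *
+     *     buckets = hi_mask[(b >> 4) & 0xf] & lo_mask[b & 0xf];
+     *
+     * with PSHUFB (16/32/64 bytes at a time) and then tests the result
+     * against the per-position bucket_select_mask, with neg_mask inverting
+     * selected positions before fail_jump is taken. This is a sketch of the
+     * technique only, not the exact runtime code.
+     */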
+ u32 fail_jump; //!< Jump forward this many bytes on failure. +}; + +struct ROSE_STRUCT_CHECK_INFIX { + u8 code; //!< From enum RoseInstructionCode. + u32 queue; //!< Queue of leftfix to check. + u32 lag; //!< Lag of leftfix for this case. + ReportID report; //!< ReportID of leftfix to check. + u32 fail_jump; //!< Jump forward this many bytes on failure. +}; + +struct ROSE_STRUCT_CHECK_PREFIX { + u8 code; //!< From enum RoseInstructionCode. + u32 queue; //!< Queue of leftfix to check. + u32 lag; //!< Lag of leftfix for this case. + ReportID report; //!< ReportID of leftfix to check. + u32 fail_jump; //!< Jump forward this many bytes on failure. +}; + +struct ROSE_STRUCT_PUSH_DELAYED { + u8 code; //!< From enum RoseInstructionCode. + u8 delay; // Number of bytes to delay. + u32 index; // Delay literal index (relative to first delay lit). +}; + +struct ROSE_STRUCT_DUMMY_NOP { + u8 code; //!< From enum RoseInstructionCode. +}; + +struct ROSE_STRUCT_CATCH_UP { + u8 code; //!< From enum RoseInstructionCode. +}; + +struct ROSE_STRUCT_CATCH_UP_MPV { + u8 code; //!< From enum RoseInstructionCode. +}; + +struct ROSE_STRUCT_SOM_ADJUST { + u8 code; //!< From enum RoseInstructionCode. + u32 distance; //!< Distance to EOM. +}; + +struct ROSE_STRUCT_SOM_LEFTFIX { + u8 code; //!< From enum RoseInstructionCode. + u32 queue; //!< Queue index of leftfix providing SOM. + u32 lag; //!< Lag of leftfix for this case. +}; + +struct ROSE_STRUCT_SOM_FROM_REPORT { + u8 code; //!< From enum RoseInstructionCode. + struct som_operation som; +}; + +struct ROSE_STRUCT_SOM_ZERO { + u8 code; //!< From enum RoseInstructionCode. +}; + +struct ROSE_STRUCT_TRIGGER_INFIX { + u8 code; //!< From enum RoseInstructionCode. + u8 cancel; //!< Cancels previous top event. + u32 queue; //!< Queue index of infix. + u32 event; //!< Queue event, from MQE_*. +}; + +struct ROSE_STRUCT_TRIGGER_SUFFIX { + u8 code; //!< From enum RoseInstructionCode. + u32 queue; //!< Queue index of suffix. + u32 event; //!< Queue event, from MQE_*. +}; + +struct ROSE_STRUCT_DEDUPE { + u8 code; //!< From enum RoseInstructionCode. + u8 quash_som; //!< Force SOM to zero for this report. + u32 dkey; //!< Dedupe key. + s32 offset_adjust; //!< Offset adjustment to apply to end offset. + u32 fail_jump; //!< Jump forward this many bytes on failure. +}; + +struct ROSE_STRUCT_DEDUPE_SOM { + u8 code; //!< From enum RoseInstructionCode. + u8 quash_som; //!< Force SOM to zero for this report. + u32 dkey; //!< Dedupe key. + s32 offset_adjust; //!< Offset adjustment to apply to end offset. + u32 fail_jump; //!< Jump forward this many bytes on failure. +}; + +struct ROSE_STRUCT_REPORT_CHAIN { + u8 code; //!< From enum RoseInstructionCode. + u32 event; //!< Queue event, from MQE_*. Must be a top. + + /** + * \brief Number of bytes behind us that we are allowed to squash + * identical top events on the queue. + */ + u64a top_squash_distance; +}; + +struct ROSE_STRUCT_REPORT_SOM_INT { + u8 code; //!< From enum RoseInstructionCode. + struct som_operation som; +}; + +struct ROSE_STRUCT_REPORT_SOM_AWARE { + u8 code; //!< From enum RoseInstructionCode. + struct som_operation som; +}; + +struct ROSE_STRUCT_REPORT { + u8 code; //!< From enum RoseInstructionCode. + ReportID onmatch; //!< Report ID to deliver to user. + s32 offset_adjust; //!< Offset adjustment to apply to end offset. +}; + +struct ROSE_STRUCT_REPORT_EXHAUST { + u8 code; //!< From enum RoseInstructionCode. + ReportID onmatch; //!< Report ID to deliver to user. 
+ s32 offset_adjust; //!< Offset adjustment to apply to end offset. + u32 ekey; //!< Exhaustion key. +}; + +struct ROSE_STRUCT_REPORT_SOM { + u8 code; //!< From enum RoseInstructionCode. + ReportID onmatch; //!< Report ID to deliver to user. + s32 offset_adjust; //!< Offset adjustment to apply to end offset. +}; + +struct ROSE_STRUCT_REPORT_SOM_EXHAUST { + u8 code; //!< From enum RoseInstructionCode. + ReportID onmatch; //!< Report ID to deliver to user. + s32 offset_adjust; //!< Offset adjustment to apply to end offset. + u32 ekey; //!< Exhaustion key. +}; + +struct ROSE_STRUCT_DEDUPE_AND_REPORT { + u8 code; //!< From enum RoseInstructionCode. + u8 quash_som; //!< Force SOM to zero for this report. + u32 dkey; //!< Dedupe key. + ReportID onmatch; //!< Report ID to deliver to user. + s32 offset_adjust; //!< Offset adjustment to apply to end offset. + u32 fail_jump; //!< Jump forward this many bytes on failure. +}; + +struct ROSE_STRUCT_FINAL_REPORT { + u8 code; //!< From enum RoseInstructionCode. + ReportID onmatch; //!< Report ID to deliver to user. + s32 offset_adjust; //!< Offset adjustment to apply to end offset. +}; + +struct ROSE_STRUCT_CHECK_EXHAUSTED { + u8 code; //!< From enum RoseInstructionCode. + u32 ekey; //!< Exhaustion key to check. + u32 fail_jump; //!< Jump forward this many bytes on failure. +}; + +struct ROSE_STRUCT_CHECK_MIN_LENGTH { + u8 code; //!< From enum RoseInstructionCode. + s32 end_adj; //!< Offset adjustment to add to EOM first. + u64a min_length; //!< Minimum distance from SOM to EOM. + u32 fail_jump; //!< Jump forward this many bytes on failure. +}; + +struct ROSE_STRUCT_SET_STATE { + u8 code; //!< From enum RoseInstructionCode. + u32 index; //!< State index in multibit. +}; + +struct ROSE_STRUCT_SET_GROUPS { + u8 code; //!< From enum RoseInstructionCode. + rose_group groups; //!< Bitmask to OR into groups. +}; + +struct ROSE_STRUCT_SQUASH_GROUPS { + u8 code; //!< From enum RoseInstructionCode. + rose_group groups; //!< Bitmask to AND into groups. +}; + +struct ROSE_STRUCT_CHECK_STATE { + u8 code; //!< From enum RoseInstructionCode. + u32 index; //!< State index in the role multibit. + u32 fail_jump; //!< Jump forward this many bytes on failure. +}; + +/** + * Note that the offsets in the jump table are always relative to the start of + * the program, not the current instruction. + */ +struct ROSE_STRUCT_SPARSE_ITER_BEGIN { + u8 code; //!< From enum RoseInstructionCode. + u32 iter_offset; //!< Offset of mmbit_sparse_iter structure. + u32 jump_table; //!< Offset of jump table indexed by sparse iterator. + u32 fail_jump; //!< Jump forward this many bytes on failure. +}; + +/** + * Note that the offsets in the jump table are always relative to the start of + * the program, not the current instruction. + */ +struct ROSE_STRUCT_SPARSE_ITER_NEXT { + u8 code; //!< From enum RoseInstructionCode. + u32 iter_offset; //!< Offset of mmbit_sparse_iter structure. + u32 jump_table; //!< Offset of jump table indexed by sparse iterator. + u32 state; // Current state index. + u32 fail_jump; //!< Jump forward this many bytes on failure. +}; + +struct ROSE_STRUCT_SPARSE_ITER_ANY { + u8 code; //!< From enum RoseInstructionCode. + u32 iter_offset; //!< Offset of mmbit_sparse_iter structure. + u32 fail_jump; //!< Jump forward this many bytes on failure. +}; + +struct ROSE_STRUCT_ENGINES_EOD { + u8 code; //!< From enum RoseInstructionCode. + u32 iter_offset; //!< Offset of mmbit_sparse_iter structure. 
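+
+    /*
+     * iter_offset names an mmbit_sparse_iter over the active-engine
+     * multibit, so at EOD only engines that are still alive are visited.
+     * A sketch of the usage pattern, using the helpers visible in stream.c:
+     *
+     *     const struct mmbit_sparse_iter *it = getByOffset(t, iter_offset);
+     *     struct mmbit_sparse_state si[MAX_SPARSE_ITER_STATES];
+     *     u32 idx = 0;
+     *     for (u32 qi = mmbit_sparse_iter_begin(aa, aaCount, &idx, it, si);
+     *          qi != MMB_INVALID;
+     *          qi = mmbit_sparse_iter_next(aa, aaCount, qi, &idx, it, si)) {
+     *         // run EOD handling for the engine on queue qi
+     *     }
+     */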
+}; + +struct ROSE_STRUCT_SUFFIXES_EOD { + u8 code; //!< From enum RoseInstructionCode. +}; + +struct ROSE_STRUCT_MATCHER_EOD { + u8 code; //!< From enum RoseInstructionCode. +}; + +struct ROSE_STRUCT_CHECK_LONG_LIT { + u8 code; //!< From enum RoseInstructionCode. + u32 lit_offset; //!< Offset of literal string. + u32 lit_length; //!< Length of literal string. + u32 fail_jump; //!< Jump forward this many bytes on failure. +}; + +struct ROSE_STRUCT_CHECK_LONG_LIT_NOCASE { + u8 code; //!< From enum RoseInstructionCode. + u32 lit_offset; //!< Offset of literal string. + u32 lit_length; //!< Length of literal string. + u32 fail_jump; //!< Jump forward this many bytes on failure. +}; + +struct ROSE_STRUCT_CHECK_MED_LIT { + u8 code; //!< From enum RoseInstructionCode. + u32 lit_offset; //!< Offset of literal string. + u32 lit_length; //!< Length of literal string. + u32 fail_jump; //!< Jump forward this many bytes on failure. +}; + +struct ROSE_STRUCT_CHECK_MED_LIT_NOCASE { + u8 code; //!< From enum RoseInstructionCode. + u32 lit_offset; //!< Offset of literal string. + u32 lit_length; //!< Length of literal string. + u32 fail_jump; //!< Jump forward this many bytes on failure. +}; + +struct ROSE_STRUCT_CLEAR_WORK_DONE { + u8 code; //!< From enum RoseInstructionCode. +}; + +struct ROSE_STRUCT_MULTIPATH_LOOKAROUND { + u8 code; //!< From enum RoseInstructionCode. + u32 look_index; //!< Offset in bytecode of lookaround offset list. + u32 reach_index; //!< Offset in bytecode of lookaround reach bitvectors. + u32 count; //!< The lookaround byte numbers for each path. + s32 last_start; //!< The latest start offset among 8 paths. + u8 start_mask[MULTIPATH_MAX_LEN]; /*!< Used to initialize path if left-most + * data is missed. */ + u32 fail_jump; //!< Jump forward this many bytes on failure. +}; + +struct ROSE_STRUCT_CHECK_MULTIPATH_SHUFTI_16x8 { + u8 code; //!< From enum RoseInstructionCode. + u8 nib_mask[2 * sizeof(m128)]; //!< High and low nibble mask in shufti. + u8 bucket_select_mask[sizeof(m128)]; //!< Mask for bucket assigning. + u8 data_select_mask[sizeof(m128)]; //!< Shuffle mask for data ordering. + u32 hi_bits_mask; //!< High-bits used in multi-path validation. + u32 lo_bits_mask; //!< Low-bits used in multi-path validation. + u32 neg_mask; //!< 64 bits negation mask. + s32 base_offset; //!< Relative offset of the first byte. + s32 last_start; //!< The latest start offset among 8 paths. + u32 fail_jump; //!< Jump forward this many bytes on failure. +}; + +struct ROSE_STRUCT_CHECK_MULTIPATH_SHUFTI_32x8 { + u8 code; //!< From enum RoseInstructionCode. + u8 hi_mask[sizeof(m128)]; //!< High nibble mask in shufti. + u8 lo_mask[sizeof(m128)]; //!< Low nibble mask in shufti. + u8 bucket_select_mask[sizeof(m256)]; //!< Mask for bucket assigning. + u8 data_select_mask[sizeof(m256)]; //!< Shuffle mask for data ordering. + u32 hi_bits_mask; //!< High-bits used in multi-path validation. + u32 lo_bits_mask; //!< Low-bits used in multi-path validation. + u32 neg_mask; //!< 64 bits negation mask. + s32 base_offset; //!< Relative offset of the first byte. + s32 last_start; //!< The latest start offset among 8 paths. + u32 fail_jump; //!< Jump forward this many bytes on failure. +}; + +struct ROSE_STRUCT_CHECK_MULTIPATH_SHUFTI_32x16 { + u8 code; //!< From enum RoseInstructionCode. + u8 hi_mask[sizeof(m256)]; //!< High nibble mask in shufti. + u8 lo_mask[sizeof(m256)]; //!< Low nibble mask in shufti. + u8 bucket_select_mask_hi[sizeof(m256)]; //!< Mask for bucket assigning. 
+ u8 bucket_select_mask_lo[sizeof(m256)]; //!< Mask for bucket assigning. + u8 data_select_mask[sizeof(m256)]; //!< Shuffle mask for data ordering. + u32 hi_bits_mask; //!< High-bits used in multi-path validation. + u32 lo_bits_mask; //!< Low-bits used in multi-path validation. + u32 neg_mask; //!< 64 bits negation mask. + s32 base_offset; //!< Relative offset of the first byte. + s32 last_start; //!< The latest start offset among 8 paths. + u32 fail_jump; //!< Jump forward this many bytes on failure. +}; + +struct ROSE_STRUCT_CHECK_MULTIPATH_SHUFTI_64 { + u8 code; //!< From enum RoseInstructionCode. + u8 hi_mask[sizeof(m128)]; //!< High nibble mask in shufti. + u8 lo_mask[sizeof(m128)]; //!< Low nibble mask in shufti. + u8 bucket_select_mask[2 * sizeof(m256)]; //!< Mask for bucket assigning. + u8 data_select_mask[2 * sizeof(m256)]; //!< Shuffle mask for data ordering. + u64a hi_bits_mask; //!< High-bits used in multi-path validation. + u64a lo_bits_mask; //!< Low-bits used in multi-path validation. + u64a neg_mask; //!< 64 bits negation mask. + s32 base_offset; //!< Relative offset of the first byte. + s32 last_start; //!< The latest start offset among 8 paths. + u32 fail_jump; //!< Jump forward this many bytes on failure. +}; + +struct ROSE_STRUCT_INCLUDED_JUMP { + u8 code; //!< From enum RoseInstructionCode. + u8 squash; //!< FDR confirm squash mask for included literal. + u32 child_offset; //!< Program offset of included literal. +}; + +struct ROSE_STRUCT_SET_LOGICAL { + u8 code; //!< From enum RoseInstructionCode. + u32 lkey; //!< Logical key to set. + s32 offset_adjust; //!< offsetAdjust from struct Report triggers the flush. +}; + +struct ROSE_STRUCT_SET_COMBINATION { + u8 code; //!< From enum RoseInstructionCode. + u32 ckey; //!< Combination key to set. +}; + +struct ROSE_STRUCT_FLUSH_COMBINATION { + u8 code; //!< From enum RoseInstructionCode. +}; + +struct ROSE_STRUCT_SET_EXHAUST { + u8 code; //!< From enum RoseInstructionCode. + u32 ekey; //!< Exhaustion key. +}; + +struct ROSE_STRUCT_LAST_FLUSH_COMBINATION { + u8 code; //!< From enum RoseInstructionCode. +}; +#endif // ROSE_ROSE_PROGRAM_H diff --git a/regex/rose/rose_types.h b/regex/rose/rose_types.h new file mode 100644 index 000000000..9dcef1cef --- /dev/null +++ b/regex/rose/rose_types.h @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2015-2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Rose runtime types (callbacks, etc). + */ + +#ifndef ROSE_TYPES_H +#define ROSE_TYPES_H + +#include "ue2common.h" + +struct hs_scratch; + +/** + * \brief Continue without checking for exhaustion. + * + * \ref RoseCallback return value indicating that execution should continue and + * that it is not necessary to check if all reports have been exhausted. + */ +#define ROSE_CONTINUE_MATCHING_NO_EXHAUST 2 + +/** + * \brief The type for a Rose callback. + * + * \return + * - \ref MO_HALT_MATCHING if matching should terminate; + * - \ref MO_CONTINUE_MATCHING if matching should continue; + * - \ref ROSE_CONTINUE_MATCHING_NO_EXHAUST if matching should continue and no + * exhaustion is possible. + */ +typedef int (*RoseCallback)(u64a offset, ReportID id, + struct hs_scratch *scratch); + +/** + * \brief The type for a Rose callback which also tracks start of match. + * + * Behaves just like \ref RoseCallback except that it is provided with both a + * start and an end offset. + * + * \see RoseCallback + */ +typedef int (*RoseCallbackSom)(u64a from_offset, u64a to_offset, ReportID id, + struct hs_scratch *scratch); + +#endif diff --git a/regex/rose/runtime.h b/regex/rose/runtime.h new file mode 100644 index 000000000..5fbb2b741 --- /dev/null +++ b/regex/rose/runtime.h @@ -0,0 +1,160 @@ +/* + * Copyright (c) 2015-2018, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Runtime functions shared between various Rose runtime code. 
+ */ + +#ifndef ROSE_RUNTIME_H +#define ROSE_RUNTIME_H + +#include "rose_internal.h" +#include "scratch.h" +#include "util/partial_store.h" + +/* + * ROSE STATE LAYOUT: + * + * - runtime status byte (halt status, delay rebuild dirty, etc) + * - rose state multibit + * - active leaf array (multibit) + * - active leftfix array (multibit) + * - leftfix lag table + * - anchored matcher state + * - literal groups + * - history buffer + * - exhausted bitvector + * - som slots, som multibit arrays + * - nfa stream state (for each nfa) + */ + +#define rose_inline really_inline + +/* Maximum offset that we will eagerly run prefixes to. Beyond this point, eager + * prefixes are always run in exactly the same way as normal prefixes. */ +#define EAGER_STOP_OFFSET 64 + + +static really_inline +const void *getByOffset(const struct RoseEngine *t, u32 offset) { + assert(offset < t->size); + return (const u8 *)t + offset; +} + +static really_inline +void *getRoleState(char *state) { + return state + ROSE_STATE_OFFSET_ROLE_MMBIT; +} + +/** \brief Fetch the active array for suffix nfas. */ +static really_inline +u8 *getActiveLeafArray(const struct RoseEngine *t, char *state) { + return (u8 *)(state + t->stateOffsets.activeLeafArray); +} + +/** \brief Fetch the active array for rose nfas. */ +static really_inline +u8 *getActiveLeftArray(const struct RoseEngine *t, char *state) { + return (u8 *)(state + t->stateOffsets.activeLeftArray); +} + +static really_inline +rose_group loadGroups(const struct RoseEngine *t, const char *state) { + return partial_load_u64a(state + t->stateOffsets.groups, + t->stateOffsets.groups_size); + +} + +static really_inline +void storeGroups(const struct RoseEngine *t, char *state, rose_group groups) { + partial_store_u64a(state + t->stateOffsets.groups, groups, + t->stateOffsets.groups_size); +} + +static really_inline +u8 *getLongLitState(const struct RoseEngine *t, char *state) { + return (u8 *)(state + t->stateOffsets.longLitState); +} + +static really_inline +u8 *getLeftfixLagTable(const struct RoseEngine *t, char *state) { + return (u8 *)(state + t->stateOffsets.leftfixLagTable); +} + +static really_inline +const u8 *getLeftfixLagTableConst(const struct RoseEngine *t, + const char *state) { + return (const u8 *)(state + t->stateOffsets.leftfixLagTable); +} + +static really_inline +u32 has_chained_nfas(const struct RoseEngine *t) { + return t->outfixBeginQueue; +} + +static really_inline +void updateLastMatchOffset(struct RoseContext *tctxt, u64a offset) { + DEBUG_PRINTF("match @%llu, last match @%llu\n", offset, + tctxt->lastMatchOffset); + + assert(offset >= tctxt->minMatchOffset); + assert(offset >= tctxt->lastMatchOffset); + tctxt->lastMatchOffset = offset; +} + +static really_inline +void updateLastCombMatchOffset(struct RoseContext *tctxt, u64a offset) { + DEBUG_PRINTF("match @%llu, last match @%llu\n", offset, + tctxt->lastCombMatchOffset); + + assert(offset >= tctxt->lastCombMatchOffset); + tctxt->lastCombMatchOffset = offset; +} + +static really_inline +void updateMinMatchOffset(struct RoseContext *tctxt, u64a offset) { + DEBUG_PRINTF("min match now @%llu, was @%llu\n", offset, + tctxt->minMatchOffset); + + assert(offset >= tctxt->minMatchOffset); + assert(offset >= tctxt->minNonMpvMatchOffset); + tctxt->minMatchOffset = offset; + tctxt->minNonMpvMatchOffset = offset; +} + +static really_inline +void updateMinMatchOffsetFromMpv(struct RoseContext *tctxt, u64a offset) { + DEBUG_PRINTF("min match now @%llu, was @%llu\n", offset, + tctxt->minMatchOffset); + + 
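+    /*
+     * The MPV (the chained "puff" engine) is caught up on its own schedule
+     * and may lag the other engines, so minMatchOffset is advanced to the
+     * MPV's position while minNonMpvMatchOffset must never move backwards,
+     * hence the MAX() below. For example, if the non-MPV engines are already
+     * caught up to offset 100 and the MPV reaches offset 80, minMatchOffset
+     * becomes 80 while minNonMpvMatchOffset stays at 100.
+     */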
assert(offset >= tctxt->minMatchOffset); + assert(tctxt->minNonMpvMatchOffset >= tctxt->minMatchOffset); + tctxt->minMatchOffset = offset; + tctxt->minNonMpvMatchOffset = MAX(tctxt->minNonMpvMatchOffset, offset); +} +#endif diff --git a/regex/rose/stream.c b/regex/rose/stream.c new file mode 100644 index 000000000..26268dd57 --- /dev/null +++ b/regex/rose/stream.c @@ -0,0 +1,752 @@ +/* + * Copyright (c) 2015-2018, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#include "catchup.h" +#include "counting_miracle.h" +#include "infix.h" +#include "match.h" +#include "miracle.h" +#include "program_runtime.h" +#include "rose.h" +#include "rose_internal.h" +#include "stream_long_lit.h" +#include "hwlm/hwlm.h" +#include "nfa/mcclellan.h" +#include "nfa/nfa_api.h" +#include "nfa/nfa_api_queue.h" +#include "nfa/nfa_internal.h" +#include "util/fatbit.h" + +static rose_inline +void runAnchoredTableStream(const struct RoseEngine *t, const void *atable, + size_t alen, u64a offset, + struct hs_scratch *scratch) { + char *state_base = scratch->core_info.state + t->stateOffsets.anchorState; + const struct anchored_matcher_info *curr = atable; + + do { + DEBUG_PRINTF("--anchored nfa (+%u) no %u so %u\n", + curr->anchoredMinDistance, curr->next_offset, + curr->state_offset); + const struct NFA *nfa + = (const struct NFA *)((const char *)curr + sizeof(*curr)); + assert(ISALIGNED_CL(nfa)); + assert(isMcClellanType(nfa->type)); + + char *state = state_base + curr->state_offset; + + char start = 0; + size_t adj = 0; + + if (offset <= curr->anchoredMinDistance) { + adj = curr->anchoredMinDistance - offset; + if (adj >= alen) { + goto next_nfa; + } + + start = 1; + } else { + // (No state decompress necessary.) 
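+            /*
+             * Anchored DFA state is stored directly in stream state: a
+             * single byte for MCCLELLAN_NFA_8 and two bytes otherwise. A
+             * stored value of zero means the DFA is no longer live for this
+             * stream, so the checks below simply skip it.
+             */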
+ if (nfa->type == MCCLELLAN_NFA_8) { + if (!*(u8 *)state) { + goto next_nfa; + } + } else { + if (!unaligned_load_u16(state)) { + goto next_nfa; + } + } + } + + if (nfa->type == MCCLELLAN_NFA_8) { + nfaExecMcClellan8_SimpStream(nfa, state, scratch->core_info.buf, + start, adj, alen, roseAnchoredCallback, + scratch); + } else { + nfaExecMcClellan16_SimpStream(nfa, state, scratch->core_info.buf, + start, adj, alen, + roseAnchoredCallback, scratch); + } + + next_nfa: + if (!curr->next_offset) { + break; + } + + curr = (const void *)((const char *)curr + curr->next_offset); + } while (1); +} + + +static really_inline +void saveStreamState(const struct NFA *nfa, struct mq *q, s64a loc) { + DEBUG_PRINTF("offset=%llu, length=%zu, hlength=%zu, loc=%lld\n", + q->offset, q->length, q->hlength, loc); + nfaQueueCompressState(nfa, q, loc); +} + +static really_inline +u8 getByteBefore(const struct core_info *ci, s64a sp) { + if (sp > 0) { // in main buffer + assert(sp <= (s64a)ci->len); + return ci->buf[sp - 1]; + } + // in history buffer + assert(-sp < (s64a)ci->hlen); + return ci->hbuf[ci->hlen + sp - 1]; +} + +/** \brief Return value for \ref roseScanForMiracles. */ +enum MiracleAction { + MIRACLE_DEAD, //!< kill off this engine + MIRACLE_SAVED, //!< engine has been caught up and state saved + MIRACLE_CONTINUE //!< continue running and catch up engine +}; + +static really_inline +enum MiracleAction roseScanForMiracles(const struct RoseEngine *t, char *state, + struct hs_scratch *scratch, u32 qi, + const struct LeftNfaInfo *left, + const struct NFA *nfa) { + struct core_info *ci = &scratch->core_info; + const u32 qCount = t->queueCount; + struct mq *q = scratch->queues + qi; + + const char q_active = fatbit_isset(scratch->aqa, qCount, qi); + DEBUG_PRINTF("q_active=%d\n", q_active); + + const s64a begin_loc = q_active ? q_cur_loc(q) : 0; + const s64a end_loc = ci->len; + + s64a miracle_loc; + if (roseMiracleOccurs(t, left, ci, begin_loc, end_loc, &miracle_loc)) { + goto found_miracle; + } + + if (roseCountingMiracleOccurs(t, left, ci, begin_loc, end_loc, + &miracle_loc)) { + goto found_miracle; + } + + DEBUG_PRINTF("no miracle\n"); + return MIRACLE_CONTINUE; + +found_miracle: + DEBUG_PRINTF("miracle at %lld\n", miracle_loc); + + if (left->infix) { + if (!q_active) { + DEBUG_PRINTF("killing infix\n"); + return MIRACLE_DEAD; + } + + DEBUG_PRINTF("skip q forward, %lld to %lld\n", begin_loc, miracle_loc); + q_skip_forward_to(q, miracle_loc); + if (q_last_type(q) == MQE_START) { + DEBUG_PRINTF("miracle caused infix to die\n"); + return MIRACLE_DEAD; + } + + DEBUG_PRINTF("re-init infix state\n"); + assert(q->items[q->cur].type == MQE_START); + q->items[q->cur].location = miracle_loc; + nfaQueueInitState(q->nfa, q); + } else { + if (miracle_loc > end_loc - t->historyRequired) { + char *streamState = state + getNfaInfoByQueue(t, qi)->stateOffset; + u64a offset = ci->buf_offset + miracle_loc; + u8 key = offset ? 
getByteBefore(ci, miracle_loc) : 0; + DEBUG_PRINTF("init state, key=0x%02x, offset=%llu\n", key, offset); + if (!nfaInitCompressedState(nfa, offset, streamState, key)) { + return MIRACLE_DEAD; + } + storeRoseDelay(t, state, left, (s64a)ci->len - miracle_loc); + return MIRACLE_SAVED; + } + + DEBUG_PRINTF("re-init prefix (skip %lld->%lld)\n", begin_loc, + miracle_loc); + if (!q_active) { + fatbit_set(scratch->aqa, qCount, qi); + initRoseQueue(t, qi, left, scratch); + } + q->cur = q->end = 0; + pushQueueAt(q, 0, MQE_START, miracle_loc); + pushQueueAt(q, 1, MQE_TOP, miracle_loc); + nfaQueueInitState(q->nfa, q); + } + + return MIRACLE_CONTINUE; +} + + +static really_inline +char roseCatchUpLeftfix(const struct RoseEngine *t, char *state, + struct hs_scratch *scratch, u32 qi, + const struct LeftNfaInfo *left) { + assert(!left->transient); // active roses only + + struct core_info *ci = &scratch->core_info; + const u32 qCount = t->queueCount; + struct mq *q = scratch->queues + qi; + const struct NFA *nfa = getNfaByQueue(t, qi); + + if (nfaSupportsZombie(nfa) + && ci->buf_offset /* prefix can be alive with no q */ + && !fatbit_isset(scratch->aqa, qCount, qi) + && isZombie(t, state, left)) { + DEBUG_PRINTF("yawn - zombie\n"); + return 1; + } + + if (left->stopTable) { + enum MiracleAction mrv = + roseScanForMiracles(t, state, scratch, qi, left, nfa); + switch (mrv) { + case MIRACLE_DEAD: + return 0; + case MIRACLE_SAVED: + return 1; + default: + assert(mrv == MIRACLE_CONTINUE); + break; + } + } + + if (!fatbit_set(scratch->aqa, qCount, qi)) { + initRoseQueue(t, qi, left, scratch); + + s32 sp; + if (ci->buf_offset) { + sp = -(s32)loadRoseDelay(t, state, left); + } else { + sp = 0; + } + + DEBUG_PRINTF("ci->len=%zu, sp=%d, historyRequired=%u\n", ci->len, sp, + t->historyRequired); + + if ( ci->len - sp + 1 < t->historyRequired) { + // we'll end up safely in the history region. + DEBUG_PRINTF("safely in history, skipping\n"); + storeRoseDelay(t, state, left, (s64a)ci->len - sp); + return 1; + } + + pushQueueAt(q, 0, MQE_START, sp); + if (left->infix || ci->buf_offset + sp > 0) { + loadStreamState(nfa, q, sp); + } else { + pushQueueAt(q, 1, MQE_TOP, sp); + nfaQueueInitState(nfa, q); + } + } else { + DEBUG_PRINTF("queue already active\n"); + if (q->end - q->cur == 1 && q_cur_type(q) == MQE_START) { + DEBUG_PRINTF("empty queue, start loc=%lld\n", q_cur_loc(q)); + s64a last_loc = q_cur_loc(q); + if (ci->len - last_loc + 1 < t->historyRequired) { + // we'll end up safely in the history region. + DEBUG_PRINTF("safely in history, saving state and skipping\n"); + saveStreamState(nfa, q, last_loc); + storeRoseDelay(t, state, left, (s64a)ci->len - last_loc); + return 1; + } + } + } + + // Determine whether the byte before last_loc will be in the history + // buffer on the next stream write. + s64a last_loc = q_last_loc(q); + s64a leftovers = ci->len - last_loc; + if (leftovers + 1 >= t->historyRequired) { + u32 catchup_offset = left->maxLag ? 
left->maxLag - 1 : 0; + last_loc = (s64a)ci->len - catchup_offset; + } + + if (left->infix) { + if (infixTooOld(q, last_loc)) { + DEBUG_PRINTF("infix died of old age\n"); + return 0; + } + reduceInfixQueue(q, last_loc, left->maxQueueLen, q->nfa->maxWidth); + } + + DEBUG_PRINTF("end scan at %lld\n", last_loc); + pushQueueNoMerge(q, MQE_END, last_loc); + +#ifdef DEBUG + debugQueue(q); +#endif + + char rv = nfaQueueExecRose(nfa, q, MO_INVALID_IDX); + if (!rv) { /* nfa is dead */ + DEBUG_PRINTF("died catching up to stream boundary\n"); + return 0; + } else { + DEBUG_PRINTF("alive, saving stream state\n"); + if (nfaSupportsZombie(nfa) && + nfaGetZombieStatus(nfa, q, last_loc) == NFA_ZOMBIE_ALWAYS_YES) { + DEBUG_PRINTF("not so fast - zombie\n"); + setAsZombie(t, state, left); + } else { + saveStreamState(nfa, q, last_loc); + storeRoseDelay(t, state, left, (s64a)ci->len - last_loc); + } + } + + return 1; +} + +static rose_inline +void roseCatchUpLeftfixes(const struct RoseEngine *t, char *state, + struct hs_scratch *scratch) { + if (!t->activeLeftIterOffset) { + // No sparse iter, no non-transient roses. + return; + } + + // As per UE-1629, we catch up leftfix engines to: + // * current position (last location in the queue, or last location we + // executed to if the queue is empty) if that position (and the byte + // before so we can decompress the stream state) will be in the history + // buffer on the next stream write; OR + // * (stream_boundary - max_delay) other + + u8 *ara = getActiveLeftArray(t, state); /* indexed by offsets into + * left_table */ + const u32 arCount = t->activeLeftCount; + const struct LeftNfaInfo *left_table = getLeftTable(t); + const struct mmbit_sparse_iter *it = getActiveLeftIter(t); + + struct mmbit_sparse_state si_state[MAX_SPARSE_ITER_STATES]; + + u32 idx = 0; + u32 ri = mmbit_sparse_iter_begin(ara, arCount, &idx, it, si_state); + for (; ri != MMB_INVALID; + ri = mmbit_sparse_iter_next(ara, arCount, ri, &idx, it, si_state)) { + const struct LeftNfaInfo *left = left_table + ri; + u32 qi = ri + t->leftfixBeginQueue; + DEBUG_PRINTF("leftfix %u of %u, maxLag=%u, infix=%d\n", ri, arCount, + left->maxLag, (int)left->infix); + if (!roseCatchUpLeftfix(t, state, scratch, qi, left)) { + DEBUG_PRINTF("removing rose %u from active list\n", ri); + DEBUG_PRINTF("groups old=%016llx mask=%016llx\n", + scratch->tctxt.groups, left->squash_mask); + scratch->tctxt.groups &= left->squash_mask; + mmbit_unset(ara, arCount, ri); + } + } +} + +// Saves out stream state for all our active suffix NFAs. +static rose_inline +void roseSaveNfaStreamState(const struct RoseEngine *t, char *state, + struct hs_scratch *scratch) { + struct mq *queues = scratch->queues; + u8 *aa = getActiveLeafArray(t, state); + u32 aaCount = t->activeArrayCount; + + if (scratch->tctxt.mpv_inactive) { + DEBUG_PRINTF("mpv is dead as a doornail\n"); + /* mpv if it exists is queue 0 */ + mmbit_unset(aa, aaCount, 0); + } + + for (u32 qi = mmbit_iterate(aa, aaCount, MMB_INVALID); qi != MMB_INVALID; + qi = mmbit_iterate(aa, aaCount, qi)) { + DEBUG_PRINTF("saving stream state for qi=%u\n", qi); + + struct mq *q = queues + qi; + + // If it's active, it should have an active queue (as we should have + // done some work!) 
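+        /*
+         * Each live engine's queue is compressed back into its slot in
+         * stream state at the queue's current location, so the next stream
+         * write can decompress it and resume from the same point.
+         */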
+ assert(fatbit_isset(scratch->aqa, t->queueCount, qi)); + + const struct NFA *nfa = getNfaByQueue(t, qi); + saveStreamState(nfa, q, q_cur_loc(q)); + } +} + +static rose_inline +void ensureStreamNeatAndTidy(const struct RoseEngine *t, char *state, + struct hs_scratch *scratch, size_t length, + u64a offset) { + struct RoseContext *tctxt = &scratch->tctxt; + + if (roseCatchUpTo(t, scratch, length + scratch->core_info.buf_offset) == + HWLM_TERMINATE_MATCHING) { + return; /* dead; no need to clean up state. */ + } + roseSaveNfaStreamState(t, state, scratch); + roseCatchUpLeftfixes(t, state, scratch); + roseFlushLastByteHistory(t, scratch, offset + length); + tctxt->lastEndOffset = offset + length; + storeGroups(t, state, tctxt->groups); + storeLongLiteralState(t, state, scratch); +} + +static really_inline +void do_rebuild(const struct RoseEngine *t, struct hs_scratch *scratch) { + assert(t->drmatcherOffset); + assert(!can_stop_matching(scratch)); + + const struct HWLM *hwlm = getByOffset(t, t->drmatcherOffset); + size_t len = MIN(scratch->core_info.hlen, t->delayRebuildLength); + const u8 *buf = scratch->core_info.hbuf + scratch->core_info.hlen - len; + DEBUG_PRINTF("BEGIN FLOATING REBUILD over %zu bytes\n", len); + + scratch->core_info.status &= ~STATUS_DELAY_DIRTY; + + hwlmExec(hwlm, buf, len, 0, roseDelayRebuildCallback, scratch, + scratch->tctxt.groups); + assert(!can_stop_matching(scratch)); +} + +static rose_inline +void runEagerPrefixesStream(const struct RoseEngine *t, + struct hs_scratch *scratch) { + if (!t->eagerIterOffset + || scratch->core_info.buf_offset >= EAGER_STOP_OFFSET) { + return; + } + + char *state = scratch->core_info.state; + u8 *ara = getActiveLeftArray(t, state); /* indexed by offsets into + * left_table */ + const u32 arCount = t->activeLeftCount; + const u32 qCount = t->queueCount; + const struct LeftNfaInfo *left_table = getLeftTable(t); + const struct mmbit_sparse_iter *it = getByOffset(t, t->eagerIterOffset); + + struct mmbit_sparse_state si_state[MAX_SPARSE_ITER_STATES]; + + u32 idx = 0; + u32 ri = mmbit_sparse_iter_begin(ara, arCount, &idx, it, si_state); + for (; ri != MMB_INVALID; + ri = mmbit_sparse_iter_next(ara, arCount, ri, &idx, it, si_state)) { + const struct LeftNfaInfo *left = left_table + ri; + u32 qi = ri + t->leftfixBeginQueue; + DEBUG_PRINTF("leftfix %u of %u, maxLag=%u\n", ri, arCount, left->maxLag); + + assert(!fatbit_isset(scratch->aqa, qCount, qi)); + assert(left->eager); + assert(!left->infix); + + struct mq *q = scratch->queues + qi; + const struct NFA *nfa = getNfaByQueue(t, qi); + s64a loc = MIN(scratch->core_info.len, + EAGER_STOP_OFFSET - scratch->core_info.buf_offset); + + fatbit_set(scratch->aqa, qCount, qi); + initRoseQueue(t, qi, left, scratch); + + if (scratch->core_info.buf_offset) { + s64a sp = left->transient ? -(s64a)scratch->core_info.hlen + : -(s64a)loadRoseDelay(t, state, left); + pushQueueAt(q, 0, MQE_START, sp); + if (scratch->core_info.buf_offset + sp > 0) { + loadStreamState(nfa, q, sp); + /* if the leftfix fix is currently in a match state, we cannot + * advance it. 
*/ + if (nfaInAnyAcceptState(nfa, q)) { + continue; + } + pushQueueAt(q, 1, MQE_END, loc); + } else { + pushQueueAt(q, 1, MQE_TOP, sp); + pushQueueAt(q, 2, MQE_END, loc); + nfaQueueInitState(q->nfa, q); + } + } else { + pushQueueAt(q, 0, MQE_START, 0); + pushQueueAt(q, 1, MQE_TOP, 0); + pushQueueAt(q, 2, MQE_END, loc); + nfaQueueInitState(nfa, q); + } + + char alive = nfaQueueExecToMatch(q->nfa, q, loc); + + if (!alive) { + DEBUG_PRINTF("queue %u dead, squashing\n", qi); + mmbit_unset(ara, arCount, ri); + fatbit_unset(scratch->aqa, qCount, qi); + scratch->tctxt.groups &= left->squash_mask; + } else if (q->cur == q->end) { + assert(alive != MO_MATCHES_PENDING); + /* unlike in block mode we cannot squash groups if there is no match + * in this block as we need the groups on for later stream writes */ + /* TODO: investigate possibility of a method to suppress groups for + * a single stream block. */ + DEBUG_PRINTF("queue %u finished, nfa lives\n", qi); + q->cur = q->end = 0; + pushQueueAt(q, 0, MQE_START, loc); + } else { + assert(alive == MO_MATCHES_PENDING); + DEBUG_PRINTF("queue %u unfinished, nfa lives\n", qi); + q->end--; /* remove end item */ + } + } +} + +static really_inline +int can_never_match(const struct RoseEngine *t, char *state, + struct hs_scratch *scratch, size_t length, u64a offset) { + struct RoseContext *tctxt = &scratch->tctxt; + + if (tctxt->groups) { + DEBUG_PRINTF("still has active groups\n"); + return 0; + } + + if (offset + length <= t->anchoredDistance) { /* not < as may have eod */ + DEBUG_PRINTF("still in anchored region\n"); + return 0; + } + + if (t->lastByteHistoryIterOffset) { /* last byte history is hard */ + DEBUG_PRINTF("last byte history\n"); + return 0; + } + + if (mmbit_any(getActiveLeafArray(t, state), t->activeArrayCount)) { + DEBUG_PRINTF("active leaf\n"); + return 0; + } + + return 1; +} + +void roseStreamExec(const struct RoseEngine *t, struct hs_scratch *scratch) { + DEBUG_PRINTF("OH HAI [%llu, %llu)\n", scratch->core_info.buf_offset, + scratch->core_info.buf_offset + (u64a)scratch->core_info.len); + assert(t); + assert(scratch->core_info.hbuf); + assert(scratch->core_info.buf); + + // We should not have been called if we've already been told to terminate + // matching. + assert(!told_to_stop_matching(scratch)); + + assert(mmbit_sparse_iter_state_size(t->rolesWithStateCount) + < MAX_SPARSE_ITER_STATES); + + size_t length = scratch->core_info.len; + u64a offset = scratch->core_info.buf_offset; + + // We may have a maximum width (for engines constructed entirely + // of bi-anchored patterns). If this write would result in us progressing + // beyond this point, we cannot possibly match. 
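+    /*
+     * For example, if every pattern in the database is anchored at both
+     * ends and none can match beyond stream offset 100, maxBiAnchoredWidth
+     * is 100 and a write that would carry the stream past that offset can
+     * be rejected without scanning.
+     */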
+ if (t->maxBiAnchoredWidth != ROSE_BOUND_INF + && offset + length > t->maxBiAnchoredWidth) { + DEBUG_PRINTF("bailing, write would progress beyond maxBAWidth\n"); + return; + } + + char *state = scratch->core_info.state; + + struct RoseContext *tctxt = &scratch->tctxt; + tctxt->mpv_inactive = 0; + tctxt->groups = loadGroups(t, state); + tctxt->lit_offset_adjust = offset + 1; // index after last byte + tctxt->delayLastEndOffset = offset; + tctxt->lastEndOffset = offset; + tctxt->filledDelayedSlots = 0; + tctxt->lastMatchOffset = 0; + tctxt->lastCombMatchOffset = offset; + tctxt->minMatchOffset = offset; + tctxt->minNonMpvMatchOffset = offset; + tctxt->next_mpv_offset = 0; + + DEBUG_PRINTF("BEGIN: history len=%zu, buffer len=%zu groups=%016llx\n", + scratch->core_info.hlen, scratch->core_info.len, tctxt->groups); + + fatbit_clear(scratch->aqa); + scratch->al_log_sum = 0; + scratch->catchup_pq.qm_size = 0; + + if (t->outfixBeginQueue != t->outfixEndQueue) { + streamInitSufPQ(t, state, scratch); + } + + runEagerPrefixesStream(t, scratch); + + u32 alen = t->anchoredDistance > offset ? + MIN(length + offset, t->anchoredDistance) - offset : 0; + + const struct anchored_matcher_info *atable = getALiteralMatcher(t); + if (atable && alen) { + DEBUG_PRINTF("BEGIN ANCHORED %zu/%u\n", scratch->core_info.hlen, alen); + runAnchoredTableStream(t, atable, alen, offset, scratch); + + if (can_stop_matching(scratch)) { + goto exit; + } + } + + const struct HWLM *ftable = getFLiteralMatcher(t); + if (ftable) { + // Load in long literal table state and set up "fake history" buffers + // (ll_buf, etc, used by the CHECK_LONG_LIT instruction). Note that this + // must be done here in order to ensure that it happens before any path + // that leads to storeLongLiteralState(), which relies on these buffers. + loadLongLiteralState(t, state, scratch); + + if (t->noFloatingRoots && !roseHasInFlightMatches(t, state, scratch)) { + DEBUG_PRINTF("skip FLOATING: no inflight matches\n"); + goto flush_delay_and_exit; + } + + size_t flen = length; + if (t->floatingDistance != ROSE_BOUND_INF) { + flen = t->floatingDistance > offset ? + MIN(t->floatingDistance, length + offset) - offset : 0; + } + + size_t hlength = scratch->core_info.hlen; + + char rebuild = hlength && + (scratch->core_info.status & STATUS_DELAY_DIRTY) && + (t->maxFloatingDelayedMatch == ROSE_BOUND_INF || + offset < t->maxFloatingDelayedMatch); + DEBUG_PRINTF("**rebuild %hhd status %hhu mfdm %u, offset %llu\n", + rebuild, scratch->core_info.status, + t->maxFloatingDelayedMatch, offset); + + if (rebuild) { /* rebuild floating delayed match stuff */ + do_rebuild(t, scratch); + } + + if (!flen) { + goto flush_delay_and_exit; + } + + if (flen + offset <= t->floatingMinDistance) { + DEBUG_PRINTF("skip FLOATING: before floating min\n"); + goto flush_delay_and_exit; + } + + size_t start = 0; + if (offset < t->floatingMinDistance) { + // This scan crosses the floating min distance, so we can use that + // to set HWLM's "start" offset. 
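+            // For example, with floatingMinDistance == 16 and a block that
+            // begins at stream offset 10, start becomes 6: the floating
+            // matcher ignores the first six bytes of this block.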
+ start = t->floatingMinDistance - offset; + } + DEBUG_PRINTF("start=%zu\n", start); + + DEBUG_PRINTF("BEGIN FLOATING (over %zu/%zu)\n", flen, length); + hwlmExecStreaming(ftable, flen, start, roseFloatingCallback, scratch, + tctxt->groups & t->floating_group_mask); + } + +flush_delay_and_exit: + DEBUG_PRINTF("flushing floating\n"); + if (cleanUpDelayed(t, scratch, length, offset) == HWLM_TERMINATE_MATCHING) { + return; + } + +exit: + DEBUG_PRINTF("CLEAN UP TIME\n"); + if (!can_stop_matching(scratch)) { + ensureStreamNeatAndTidy(t, state, scratch, length, offset); + } + + if (!told_to_stop_matching(scratch) + && can_never_match(t, state, scratch, length, offset)) { + DEBUG_PRINTF("PATTERN SET IS EXHAUSTED\n"); + scratch->core_info.status = STATUS_EXHAUSTED; + return; + } + + DEBUG_PRINTF("DONE STREAMING SCAN, status = %u\n", + scratch->core_info.status); + return; +} + +static rose_inline +void roseStreamInitEod(const struct RoseEngine *t, u64a offset, + struct hs_scratch *scratch) { + struct RoseContext *tctxt = &scratch->tctxt; + /* TODO: diff groups for eod */ + tctxt->groups = loadGroups(t, scratch->core_info.state); + tctxt->lit_offset_adjust = scratch->core_info.buf_offset + - scratch->core_info.hlen + + 1; // index after last byte + tctxt->delayLastEndOffset = offset; + tctxt->lastEndOffset = offset; + tctxt->filledDelayedSlots = 0; + tctxt->lastMatchOffset = 0; + tctxt->lastCombMatchOffset = offset; /* DO NOT set 0 here! */ + tctxt->minMatchOffset = offset; + tctxt->minNonMpvMatchOffset = offset; + tctxt->next_mpv_offset = offset; + + scratch->catchup_pq.qm_size = 0; + scratch->al_log_sum = 0; /* clear the anchored logs */ + + fatbit_clear(scratch->aqa); +} + +void roseStreamEodExec(const struct RoseEngine *t, u64a offset, + struct hs_scratch *scratch) { + assert(scratch); + assert(t->requiresEodCheck); + DEBUG_PRINTF("ci buf %p/%zu his %p/%zu\n", scratch->core_info.buf, + scratch->core_info.len, scratch->core_info.hbuf, + scratch->core_info.hlen); + + // We should not have been called if we've already been told to terminate + // matching. + assert(!told_to_stop_matching(scratch)); + + if (t->maxBiAnchoredWidth != ROSE_BOUND_INF + && offset > t->maxBiAnchoredWidth) { + DEBUG_PRINTF("bailing, we are beyond max width\n"); + /* also some of the history/state may be stale */ + return; + } + + if (!t->eodProgramOffset) { + DEBUG_PRINTF("no eod program\n"); + return; + } + + roseStreamInitEod(t, offset, scratch); + + DEBUG_PRINTF("running eod program at %u\n", t->eodProgramOffset); + + // There should be no pending delayed literals. + assert(!scratch->tctxt.filledDelayedSlots); + + const u64a som = 0; + const u8 flags = ROSE_PROG_FLAG_SKIP_MPV_CATCHUP; + + // Note: we ignore the result, as this is the last thing to ever happen on + // a scan. + roseRunProgram(t, scratch, t->eodProgramOffset, som, offset, flags); +} diff --git a/regex/rose/stream_long_lit.h b/regex/rose/stream_long_lit.h new file mode 100644 index 000000000..df9b57f4e --- /dev/null +++ b/regex/rose/stream_long_lit.h @@ -0,0 +1,372 @@ +/* + * Copyright (c) 2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef STREAM_LONG_LIT_H +#define STREAM_LONG_LIT_H + +#include "rose.h" +#include "rose_common.h" +#include "rose_internal.h" +#include "stream_long_lit_hash.h" +#include "util/compare.h" +#include "util/copybytes.h" + +static really_inline +const struct RoseLongLitHashEntry * +getHashTableBase(const struct RoseLongLitTable *ll_table, + const struct RoseLongLitSubtable *ll_sub) { + assert(ll_sub->hashOffset); + return (const struct RoseLongLitHashEntry *)((const char *)ll_table + + ll_sub->hashOffset); +} + +// Reads from stream state and unpacks values into stream state table. +static really_inline +void loadLongLitStreamState(const struct RoseLongLitTable *ll_table, + const u8 *ll_state, u32 *state_case, + u32 *state_nocase) { + assert(ll_table); + assert(ll_state); + assert(state_case && state_nocase); + + u8 ss_bytes = ll_table->streamStateBytes; + u8 ssb = ll_table->caseful.streamStateBits; + UNUSED u8 ssb_nc = ll_table->nocase.streamStateBits; + assert(ss_bytes == (ssb + ssb_nc + 7) / 8); + +#if defined(ARCH_32_BIT) + // On 32-bit hosts, we may be able to avoid having to do any u64a + // manipulation at all. + if (ss_bytes <= 4) { + u32 ssb_mask = (1U << ssb) - 1; + u32 streamVal = partial_load_u32(ll_state, ss_bytes); + *state_case = (u32)(streamVal & ssb_mask); + *state_nocase = (u32)(streamVal >> ssb); + return; + } +#endif + + u64a ssb_mask = (1ULL << ssb) - 1; + u64a streamVal = partial_load_u64a(ll_state, ss_bytes); + *state_case = (u32)(streamVal & ssb_mask); + *state_nocase = (u32)(streamVal >> ssb); +} + +static rose_inline +void loadLongLiteralStateMode(struct hs_scratch *scratch, + const struct RoseLongLitTable *ll_table, + const struct RoseLongLitSubtable *ll_sub, + const u32 state, const char nocase) { + if (!state) { + DEBUG_PRINTF("no state for %s\n", nocase ? 
"caseless" : "caseful"); + return; + } + + const struct RoseLongLitHashEntry *tab = getHashTableBase(ll_table, ll_sub); + const struct RoseLongLitHashEntry *ent = tab + state - 1; + + assert(ent->str_offset + ent->str_len <= ll_table->size); + const u8 *found_buf = (const u8 *)ll_table + ent->str_offset; + size_t found_sz = ent->str_len; + + struct RoseContext *tctxt = &scratch->tctxt; + if (nocase) { + tctxt->ll_buf_nocase = found_buf; + tctxt->ll_len_nocase = found_sz; + } else { + tctxt->ll_buf = found_buf; + tctxt->ll_len = found_sz; + } +} + +static rose_inline +void loadLongLiteralState(const struct RoseEngine *t, char *state, + struct hs_scratch *scratch) { + if (!t->longLitTableOffset) { + return; + } + + // If we don't have any long literals in play, these values must point to + // the real history buffer so that CHECK_LONG_LIT instructions examine the + // history buffer. + scratch->tctxt.ll_buf = scratch->core_info.hbuf; + scratch->tctxt.ll_len = scratch->core_info.hlen; + scratch->tctxt.ll_buf_nocase = scratch->core_info.hbuf; + scratch->tctxt.ll_len_nocase = scratch->core_info.hlen; + + if (!scratch->core_info.hlen) { + return; + } + + const struct RoseLongLitTable *ll_table = + getByOffset(t, t->longLitTableOffset); + const u8 *ll_state = getLongLitState(t, state); + + u32 state_case; + u32 state_nocase; + loadLongLitStreamState(ll_table, ll_state, &state_case, &state_nocase); + + DEBUG_PRINTF("loaded {%u, %u}\n", state_case, state_nocase); + + loadLongLiteralStateMode(scratch, ll_table, &ll_table->caseful, + state_case, 0); + loadLongLiteralStateMode(scratch, ll_table, &ll_table->nocase, + state_nocase, 1); +} + +static rose_inline +char confirmLongLiteral(const struct RoseLongLitTable *ll_table, + const struct hs_scratch *scratch, + const struct RoseLongLitHashEntry *ent, + const char nocase) { + assert(ent->str_offset + ent->str_len <= ll_table->size); + const u8 *s = (const u8 *)ll_table + ent->str_offset; + size_t len = ent->str_len; + const u8 *buf = scratch->core_info.buf; + const size_t buf_len = scratch->core_info.len; + + if (len > buf_len) { + const struct RoseContext *tctxt = &scratch->tctxt; + const u8 *hist = nocase ? tctxt->ll_buf_nocase : tctxt->ll_buf; + size_t hist_len = nocase ? tctxt->ll_len_nocase : tctxt->ll_len; + + if (len > buf_len + hist_len) { + return 0; // Break out - not enough total history + } + + size_t overhang = len - buf_len; + assert(overhang <= hist_len); + + if (cmpForward(hist + hist_len - overhang, s, overhang, nocase)) { + return 0; + } + s += overhang; + len -= overhang; + } + + // if we got here, we don't need history or we compared ok out of history + assert(len <= buf_len); + + if (cmpForward(buf + buf_len - len, s, len, nocase)) { + return 0; + } + + return 1; +} + +static rose_inline +const u8 *prepScanBuffer(const struct core_info *ci, + const struct RoseLongLitTable *ll_table, u8 *tempbuf) { + const u8 hash_len = ll_table->maxLen; + assert(hash_len >= LONG_LIT_HASH_LEN); + + // Our hash function operates over LONG_LIT_HASH_LEN bytes, starting from + // location (end of buffer - hash_len). If this block can be satisfied + // entirely from either the current buffer or the history buffer, we pass + // in the pointer directly; otherwise we must make a copy. + + const u8 *base; + + if (hash_len > ci->len) { + size_t overhang = hash_len - ci->len; + if (overhang >= LONG_LIT_HASH_LEN) { + // Can read enough to hash from inside the history buffer. 
+ assert(overhang <= ci->hlen); + base = ci->hbuf + ci->hlen - overhang; + } else { + // Copy: first chunk from history buffer. + assert(overhang <= ci->hlen); + copy_upto_64_bytes(tempbuf, ci->hbuf + ci->hlen - overhang, + overhang); + // Copy: second chunk from current buffer. + size_t copy_buf_len = LONG_LIT_HASH_LEN - overhang; + assert(copy_buf_len <= ci->len); + copy_upto_64_bytes(tempbuf + overhang, ci->buf, copy_buf_len); + // Read from our temporary buffer for the hash. + base = tempbuf; + } + } else { + // Can read enough to hash from inside the current buffer. + base = ci->buf + ci->len - hash_len; + } + + return base; +} + +#ifndef NDEBUG +// Defensive checking (used in assert) that these table values don't overflow +// the range available. +static really_inline +char streamingTableOverflow(u32 state_case, u32 state_nocase, u8 ssb, + u8 ssb_nc) { + u32 ssb_mask = (1ULL << (ssb)) - 1; + if (state_case & ~ssb_mask) { + return 1; + } + u32 ssb_nc_mask = (1ULL << (ssb_nc)) - 1; + if (state_nocase & ~ssb_nc_mask) { + return 1; + } + return 0; +} +#endif + +// Reads from stream state table and packs values into stream state. +static rose_inline +void storeLongLitStreamState(const struct RoseLongLitTable *ll_table, + u8 *ll_state, u32 state_case, u32 state_nocase) { + assert(ll_table); + assert(ll_state); + + u8 ss_bytes = ll_table->streamStateBytes; + u8 ssb = ll_table->caseful.streamStateBits; + UNUSED u8 ssb_nc = ll_table->nocase.streamStateBits; + assert(ss_bytes == ROUNDUP_N(ssb + ssb_nc, 8) / 8); + assert(!streamingTableOverflow(state_case, state_nocase, ssb, ssb_nc)); + +#if defined(ARCH_32_BIT) + // On 32-bit hosts, we may be able to avoid having to do any u64a + // manipulation at all. + if (ss_bytes <= 4) { + u32 stagingStreamState = state_case; + stagingStreamState |= (state_nocase << ssb); + partial_store_u32(ll_state, stagingStreamState, ss_bytes); + return; + } +#endif + + u64a stagingStreamState = (u64a)state_case; + stagingStreamState |= (u64a)state_nocase << ssb; + partial_store_u64a(ll_state, stagingStreamState, ss_bytes); +} + +static really_inline +char has_bit(const u8 *data, u32 bit) { + return (data[bit / 8] >> (bit % 8)) & 1; +} + +static rose_inline +char bloomHasKey(const u8 *bloom, u32 bloom_mask, u32 hash) { + return has_bit(bloom, hash & bloom_mask); +} + +static rose_inline +char checkBloomFilter(const struct RoseLongLitTable *ll_table, + const struct RoseLongLitSubtable *ll_sub, + const u8 *scan_buf, char nocase) { + assert(ll_sub->bloomBits); + + const u8 *bloom = (const u8 *)ll_table + ll_sub->bloomOffset; + const u32 bloom_mask = (1U << ll_sub->bloomBits) - 1; + + char v = 1; + v &= bloomHasKey(bloom, bloom_mask, bloomHash_1(scan_buf, nocase)); + v &= bloomHasKey(bloom, bloom_mask, bloomHash_2(scan_buf, nocase)); + v &= bloomHasKey(bloom, bloom_mask, bloomHash_3(scan_buf, nocase)); + return v; +} + +/** + * \brief Look for a hit in the hash table. + * + * Returns zero if not found, otherwise returns (bucket + 1). 
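+ *
+ * The table is open-addressed with linear probing: start at
+ * (hash & (2^hashBits - 1)) and walk forward, wrapping at the end of the
+ * table, until either a bucket is confirmed against the input or an empty
+ * bucket (str_offset == 0) terminates the search.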
+ */ +static rose_inline +u32 checkHashTable(const struct RoseLongLitTable *ll_table, + const struct RoseLongLitSubtable *ll_sub, const u8 *scan_buf, + const struct hs_scratch *scratch, char nocase) { + const u32 nbits = ll_sub->hashBits; + assert(nbits && nbits < 32); + const u32 num_entries = 1U << nbits; + + const struct RoseLongLitHashEntry *tab = getHashTableBase(ll_table, ll_sub); + + u32 hash = hashLongLiteral(scan_buf, LONG_LIT_HASH_LEN, nocase); + u32 bucket = hash & ((1U << nbits) - 1); + + while (tab[bucket].str_offset != 0) { + DEBUG_PRINTF("checking bucket %u\n", bucket); + if (confirmLongLiteral(ll_table, scratch, &tab[bucket], nocase)) { + DEBUG_PRINTF("found hit for bucket %u\n", bucket); + return bucket + 1; + } + + if (++bucket == num_entries) { + bucket = 0; + } + } + + return 0; +} + +static rose_inline +void storeLongLiteralState(const struct RoseEngine *t, char *state, + struct hs_scratch *scratch) { + if (!t->longLitTableOffset) { + DEBUG_PRINTF("no table\n"); + return; + } + + struct core_info *ci = &scratch->core_info; + const struct RoseLongLitTable *ll_table = + getByOffset(t, t->longLitTableOffset); + assert(ll_table->maxLen); + + DEBUG_PRINTF("maxLen=%u, len=%zu, hlen=%zu\n", ll_table->maxLen, ci->len, + ci->hlen); + + u32 state_case = 0; + u32 state_nocase = 0; + + // If we don't have enough history, we don't need to do anything. + if (ll_table->maxLen <= ci->len + ci->hlen) { + u8 tempbuf[LONG_LIT_HASH_LEN]; + const u8 *scan_buf = prepScanBuffer(ci, ll_table, tempbuf); + + if (ll_table->caseful.hashBits && + checkBloomFilter(ll_table, &ll_table->caseful, scan_buf, 0)) { + state_case = checkHashTable(ll_table, &ll_table->caseful, scan_buf, + scratch, 0); + } + + if (ll_table->nocase.hashBits && + checkBloomFilter(ll_table, &ll_table->nocase, scan_buf, 1)) { + state_nocase = checkHashTable(ll_table, &ll_table->nocase, scan_buf, + scratch, 1); + } + } else { + DEBUG_PRINTF("not enough history (%zu bytes)\n", ci->len + ci->hlen); + } + + DEBUG_PRINTF("store {%u, %u}\n", state_case, state_nocase); + + u8 *ll_state = getLongLitState(t, state); + storeLongLitStreamState(ll_table, ll_state, state_case, state_nocase); +} + +#endif // STREAM_LONG_LIT_H diff --git a/regex/rose/stream_long_lit_hash.h b/regex/rose/stream_long_lit_hash.h new file mode 100644 index 000000000..041f05e60 --- /dev/null +++ b/regex/rose/stream_long_lit_hash.h @@ -0,0 +1,105 @@ +/* + * Copyright (c) 2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef STREAM_LONG_LIT_HASH_H
+#define STREAM_LONG_LIT_HASH_H
+
+#include "ue2common.h"
+#include "util/bitutils.h"
+#include "util/unaligned.h"
+
+/** \brief Length of the buffer operated on by \ref hashLongLiteral(). */
+#define LONG_LIT_HASH_LEN 24
+
+/** \brief Multiplier used by all the hash functions below. */
+#define HASH_MULTIPLIER 0x0b4e0ef37bc32127ULL
+
+/** \brief Hash function used for the long literal table in streaming mode. */
+static really_inline
+u32 hashLongLiteral(const u8 *ptr, UNUSED size_t len, char nocase) {
+    // We unconditionally hash LONG_LIT_HASH_LEN bytes; all use cases of this
+    // hash are for strings longer than this.
+    assert(len >= 24);
+
+    u64a v1 = unaligned_load_u64a(ptr);
+    u64a v2 = unaligned_load_u64a(ptr + 8);
+    u64a v3 = unaligned_load_u64a(ptr + 16);
+    if (nocase) {
+        v1 &= OCTO_CASE_CLEAR;
+        v2 &= OCTO_CASE_CLEAR;
+        v3 &= OCTO_CASE_CLEAR;
+    }
+    v1 *= HASH_MULTIPLIER;
+    v2 *= HASH_MULTIPLIER * HASH_MULTIPLIER;
+    v3 *= HASH_MULTIPLIER * HASH_MULTIPLIER * HASH_MULTIPLIER;
+    v1 >>= 32;
+    v2 >>= 32;
+    v3 >>= 32;
+    return v1 ^ v2 ^ v3;
+}
+
+/**
+ * \brief Internal, used by the bloom filter hash functions below. Hashes the
+ * 8 bytes beginning at (ptr + offset).
+ */
+static really_inline
+u32 bloomHash_i(const u8 *ptr, u32 offset, u64a multiplier, char nocase) {
+    assert(offset + 16 <= LONG_LIT_HASH_LEN);
+
+    u64a v = unaligned_load_u64a(ptr + offset);
+    if (nocase) {
+        v &= OCTO_CASE_CLEAR;
+    }
+    v *= multiplier;
+    return v >> 32;
+}
+
+/*
+ * We ensure that every byte of the first 16 bytes of the hash block is seen
+ * by at least one of the following functions (each hashes the 8 bytes at its
+ * own offset).
+ */
+
+static really_inline
+u32 bloomHash_1(const u8 *ptr, char nocase) {
+    const u64a multiplier = HASH_MULTIPLIER;
+    return bloomHash_i(ptr, 0, multiplier, nocase);
+}
+
+static really_inline
+u32 bloomHash_2(const u8 *ptr, char nocase) {
+    const u64a multiplier = HASH_MULTIPLIER * HASH_MULTIPLIER;
+    return bloomHash_i(ptr, 4, multiplier, nocase);
+}
+
+static really_inline
+u32 bloomHash_3(const u8 *ptr, char nocase) {
+    const u64a multiplier = HASH_MULTIPLIER * HASH_MULTIPLIER * HASH_MULTIPLIER;
+    return bloomHash_i(ptr, 8, multiplier, nocase);
+}
+
+#endif // STREAM_LONG_LIT_HASH_H
diff --git a/regex/rose/validate_mask.h b/regex/rose/validate_mask.h
new file mode 100644
index 000000000..8191db52f
--- /dev/null
+++ b/regex/rose/validate_mask.h
@@ -0,0 +1,154 @@
+/*
+ * Copyright (c) 2016-2020, Intel Corporation
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef VALIDATE_MASK_H
+#define VALIDATE_MASK_H
+
+#include "ue2common.h"
+#include "util/simd_utils.h"
+
+#if defined(DEBUG)
+static
+void validateMask32Print(const u8 *mask) {
+    int i;
+    for (i = 0; i < 32; i++) {
+        printf("%02x", mask[i]);
+    }
+    printf("\n");
+}
+
+#ifdef HAVE_AVX512
+static
+void validateMask64Print(const u8 *mask) {
+    int i;
+    for (i = 0; i < 64; i++) {
+        printf("%02x ", mask[i]);
+    }
+    printf("\n");
+}
+#endif
+#endif
+
+// Check the positive (non-negated) bytes of cmp_result.
+// Returns one if the check passed, zero otherwise.
+static really_inline
+int posValidateMask(const u64a cmp_result, const u64a pos_mask) {
+    return !(cmp_result & pos_mask);
+}
+
+/*
+ * Check the negated bytes of cmp_result.
+ * Returns one if every byte selected by neg_mask is non-zero in cmp_result,
+ * zero otherwise. The lowest 7 bits and the highest bit of every byte are
+ * checked separately.
+ */
+static really_inline
+int negValidateMask(const u64a cmp_result, const u64a neg_mask) {
+    const u64a count_mask = 0x7f7f7f7f7f7f7f7f;
+    // Check the lowest 7 bits of every byte: the carry sets that byte's
+    // highest bit if any of them is non-zero.
+    u64a check_low = (cmp_result & count_mask) + count_mask;
+    // Fold in the highest bit of every byte: a byte of
+    // (check_low | cmp_result | count_mask) is 0xff iff the corresponding
+    // cmp_result byte is non-zero, 0x7f otherwise. Inverting maps those to
+    // 0x00 and 0x80 respectively.
+    u64a check_all = ~(check_low | cmp_result | count_mask);
+    return !(check_all & neg_mask);
+}
+
+static really_inline
+int validateMask(u64a data, u64a valid_data_mask, u64a and_mask,
+                 u64a cmp_mask, u64a neg_mask) {
+    // Bytes where valid_data_mask is 0x00 are skipped entirely.
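+    // Overall contract: every byte selected by ~neg_mask must equal the
+    // corresponding cmp_mask byte once and_mask is applied, and every byte
+    // selected by neg_mask must differ from it.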
+ and_mask &= valid_data_mask; + cmp_mask &= valid_data_mask; + neg_mask &= valid_data_mask; + u64a cmp_result = (data & and_mask) ^ cmp_mask; + /* do the positive check first since it's cheaper */ + if (posValidateMask(cmp_result, ~neg_mask) + && negValidateMask(cmp_result, neg_mask)) { + return 1; + } else { + DEBUG_PRINTF("data %llx valid_data_mask(vdm) %llx\n", + data, valid_data_mask); + DEBUG_PRINTF("and_mask & vdm %llx cmp_mask & vdm %llx\n", and_mask, + cmp_mask); + DEBUG_PRINTF("cmp_result %llx neg_mask & vdm %llx\n", + cmp_result, neg_mask); + return 0; + } +} + +static really_inline +int validateMask32(const m256 data, const u32 valid_data_mask, + const m256 and_mask, const m256 cmp_mask, + const u32 neg_mask) { + m256 cmp_result_256 = eq256(and256(data, and_mask), cmp_mask); + u32 cmp_result = ~movemask256(cmp_result_256); +#ifdef DEBUG + DEBUG_PRINTF("data\n"); + validateMask32Print((const u8 *)&data); + DEBUG_PRINTF("cmp_result\n"); + validateMask32Print((const u8 *)&cmp_result_256); +#endif + DEBUG_PRINTF("cmp_result %08x neg_mask %08x\n", cmp_result, neg_mask); + DEBUG_PRINTF("valid_data_mask %08x\n", valid_data_mask); + + if ((cmp_result & valid_data_mask) == (neg_mask & valid_data_mask)) { + DEBUG_PRINTF("checkCompareResult32 passed\n"); + return 1; + } else { + DEBUG_PRINTF("checkCompareResult32 failed\n"); + return 0; + } +} + +#ifdef HAVE_AVX512 +static really_inline +int validateMask64(const m512 data, const u64a valid_data_mask, + const m512 and_mask, const m512 cmp_mask, + const u64a neg_mask) { + u64a cmp_result = ~eq512mask(and512(data, and_mask), cmp_mask); +#ifdef DEBUG + DEBUG_PRINTF("data\n"); + validateMask64Print((const u8 *)&data); + DEBUG_PRINTF("cmp_result\n"); + validateMask64Print((const u8 *)&cmp_result); +#endif + DEBUG_PRINTF("cmp_result %016llx neg_mask %016llx\n", cmp_result, neg_mask); + DEBUG_PRINTF("valid_data_mask %016llx\n", valid_data_mask); + + if ((cmp_result & valid_data_mask) == (neg_mask & valid_data_mask)) { + DEBUG_PRINTF("checkCompareResult64 passed\n"); + return 1; + } else { + DEBUG_PRINTF("checkCompareResult64 failed\n"); + return 0; + } +} +#endif + +#endif diff --git a/regex/rose/validate_shufti.h b/regex/rose/validate_shufti.h new file mode 100644 index 000000000..351df36a7 --- /dev/null +++ b/regex/rose/validate_shufti.h @@ -0,0 +1,372 @@ +/* + * Copyright (c) 2016-2020, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef VALIDATE_SHUFTI_H +#define VALIDATE_SHUFTI_H + +#include "ue2common.h" +#include "util/simd_utils.h" + +#if defined(DEBUG) +static +void dumpMask(const void *mask, int len) { + const u8 *c = (const u8 *)mask; + for (int i = 0; i < len; i++) { + printf("%02x", c[i]); + } + printf("\n"); +} +#endif + +static really_inline +int validateShuftiMask16x16(const m256 data, const m256 hi_mask, + const m256 lo_mask, const m256 and_mask, + const u32 neg_mask, const u32 valid_data_mask) { + m256 low4bits = set32x8(0xf); + m256 c_lo = pshufb_m256(lo_mask, and256(data, low4bits)); + m256 c_hi = pshufb_m256(hi_mask, + rshift64_m256(andnot256(low4bits, data), 4)); + m256 t = and256(c_lo, c_hi); + u32 nresult = movemask256(eq256(and256(t, and_mask), zeroes256())); +#ifdef DEBUG + DEBUG_PRINTF("data\n"); + dumpMask(&data, 32); + DEBUG_PRINTF("hi_mask\n"); + dumpMask(&hi_mask, 32); + DEBUG_PRINTF("lo_mask\n"); + dumpMask(&lo_mask, 32); + DEBUG_PRINTF("c_lo\n"); + dumpMask(&c_lo, 32); + DEBUG_PRINTF("c_hi\n"); + dumpMask(&c_hi, 32); + DEBUG_PRINTF("and_mask\n"); + dumpMask(&and_mask, 32); + DEBUG_PRINTF("nresult %x\n", nresult); + DEBUG_PRINTF("valid_data_mask %x\n", valid_data_mask); +#endif + u32 cmp_result = (((nresult >> 16) & nresult) ^ neg_mask) & valid_data_mask; + return !cmp_result; +} + +static really_inline +int validateShuftiMask16x8(const m128 data, const m256 nib_mask, + const m128 and_mask, const u32 neg_mask, + const u32 valid_data_mask) { + m256 data_m256 = combine2x128(rshift64_m128(data, 4), data); + m256 low4bits = set32x8(0xf); + m256 c_nib = pshufb_m256(nib_mask, and256(data_m256, low4bits)); + m128 t = and128(movdq_hi(c_nib), movdq_lo(c_nib)); + m128 nresult = eq128(and128(t, and_mask), zeroes128()); +#ifdef DEBUG + DEBUG_PRINTF("data\n"); + dumpMask(&data_m256, 32); + DEBUG_PRINTF("nib_mask\n"); + dumpMask(&nib_mask, 32); + DEBUG_PRINTF("c_nib\n"); + dumpMask(&c_nib, 32); + DEBUG_PRINTF("nresult\n"); + dumpMask(&nresult, 16); + DEBUG_PRINTF("valid_data_mask %x\n", valid_data_mask); +#endif + u32 cmp_result = (movemask128(nresult) ^ neg_mask) & valid_data_mask; + return !cmp_result; +} + +static really_inline +int validateShuftiMask32x8(const m256 data, const m256 hi_mask, + const m256 lo_mask, const m256 and_mask, + const u32 neg_mask, const u32 valid_data_mask) { + m256 low4bits = set32x8(0xf); + m256 c_lo = pshufb_m256(lo_mask, and256(data, low4bits)); + m256 c_hi = pshufb_m256(hi_mask, + rshift64_m256(andnot256(low4bits, data), 4)); + m256 t = and256(c_lo, c_hi); + m256 nresult = eq256(and256(t, and_mask), zeroes256()); +#ifdef DEBUG + DEBUG_PRINTF("data\n"); + dumpMask(&data, 32); + DEBUG_PRINTF("hi_mask\n"); + dumpMask(&hi_mask, 32); + DEBUG_PRINTF("lo_mask\n"); + dumpMask(&lo_mask, 32); + DEBUG_PRINTF("c_lo\n"); + dumpMask(&c_lo, 32); + DEBUG_PRINTF("c_hi\n"); + dumpMask(&c_hi, 32); + DEBUG_PRINTF("nresult\n"); + dumpMask(&nresult, 32); + DEBUG_PRINTF("valid_data_mask %x\n", valid_data_mask); +#endif + u32 cmp_result = 
(movemask256(nresult) ^ neg_mask) & valid_data_mask; + return !cmp_result; +} + +static really_inline +int validateShuftiMask32x16(const m256 data, + const m256 hi_mask_1, const m256 hi_mask_2, + const m256 lo_mask_1, const m256 lo_mask_2, + const m256 bucket_mask_hi, + const m256 bucket_mask_lo, const u32 neg_mask, + const u32 valid_data_mask) { + m256 low4bits = set32x8(0xf); + m256 data_lo = and256(data, low4bits); + m256 data_hi = and256(rshift64_m256(data, 4), low4bits); + m256 c_lo_1 = pshufb_m256(lo_mask_1, data_lo); + m256 c_lo_2 = pshufb_m256(lo_mask_2, data_lo); + m256 c_hi_1 = pshufb_m256(hi_mask_1, data_hi); + m256 c_hi_2 = pshufb_m256(hi_mask_2, data_hi); + m256 t1 = and256(c_lo_1, c_hi_1); + m256 t2 = and256(c_lo_2, c_hi_2); + m256 result = or256(and256(t1, bucket_mask_lo), and256(t2, bucket_mask_hi)); + u32 nresult = movemask256(eq256(result, zeroes256())); +#ifdef DEBUG + DEBUG_PRINTF("data\n"); + dumpMask(&data, 32); + DEBUG_PRINTF("data_lo\n"); + dumpMask(&data_lo, 32); + DEBUG_PRINTF("data_hi\n"); + dumpMask(&data_hi, 32); + DEBUG_PRINTF("hi_mask_1\n"); + dumpMask(&hi_mask_1, 16); + DEBUG_PRINTF("hi_mask_2\n"); + dumpMask(&hi_mask_2, 16); + DEBUG_PRINTF("lo_mask_1\n"); + dumpMask(&lo_mask_1, 16); + DEBUG_PRINTF("lo_mask_2\n"); + dumpMask(&lo_mask_2, 16); + DEBUG_PRINTF("c_lo_1\n"); + dumpMask(&c_lo_1, 32); + DEBUG_PRINTF("c_lo_2\n"); + dumpMask(&c_lo_2, 32); + DEBUG_PRINTF("c_hi_1\n"); + dumpMask(&c_hi_1, 32); + DEBUG_PRINTF("c_hi_2\n"); + dumpMask(&c_hi_2, 32); + DEBUG_PRINTF("result\n"); + dumpMask(&result, 32); + DEBUG_PRINTF("valid_data_mask %x\n", valid_data_mask); +#endif + u32 cmp_result = (nresult ^ neg_mask) & valid_data_mask; + return !cmp_result; +} + +#ifdef HAVE_AVX512 +static really_inline +int validateShuftiMask64x8(const m512 data, const m512 hi_mask, + const m512 lo_mask, const m512 and_mask, + const u64a neg_mask, const u64a valid_data_mask) { + m512 low4bits = set64x8(0xf); + m512 c_lo = pshufb_m512(lo_mask, and512(data, low4bits)); + m512 c_hi = pshufb_m512(hi_mask, + rshift64_m512(andnot512(low4bits, data), 4)); + m512 t = and512(c_lo, c_hi); + u64a nresult = eq512mask(and512(t, and_mask), zeroes512()); +#ifdef DEBUG + DEBUG_PRINTF("data\n"); + dumpMask(&data, 64); + DEBUG_PRINTF("hi_mask\n"); + dumpMask(&hi_mask, 64); + DEBUG_PRINTF("lo_mask\n"); + dumpMask(&lo_mask, 64); + DEBUG_PRINTF("c_lo\n"); + dumpMask(&c_lo, 64); + DEBUG_PRINTF("c_hi\n"); + dumpMask(&c_hi, 64); + DEBUG_PRINTF("nresult %llx\n", nresult); + DEBUG_PRINTF("valid_data_mask %llx\n", valid_data_mask); +#endif + u64a cmp_result = (nresult ^ neg_mask) & valid_data_mask; + return !cmp_result; +} + +static really_inline +int validateShuftiMask64x16(const m512 data, + const m512 hi_mask_1, const m512 hi_mask_2, + const m512 lo_mask_1, const m512 lo_mask_2, + const m512 and_mask_hi, const m512 and_mask_lo, + const u64a neg_mask, const u64a valid_data_mask) { + m512 low4bits = set64x8(0xf); + m512 data_lo = and512(data, low4bits); + m512 data_hi = and512(rshift64_m512(data, 4), low4bits); + m512 c_lo_1 = pshufb_m512(lo_mask_1, data_lo); + m512 c_lo_2 = pshufb_m512(lo_mask_2, data_lo); + m512 c_hi_1 = pshufb_m512(hi_mask_1, data_hi); + m512 c_hi_2 = pshufb_m512(hi_mask_2, data_hi); + m512 t1 = and512(c_lo_1, c_hi_1); + m512 t2 = and512(c_lo_2, c_hi_2); + m512 result = or512(and512(t1, and_mask_lo), and512(t2, and_mask_hi)); + u64a nresult = eq512mask(result, zeroes512()); +#ifdef DEBUG + DEBUG_PRINTF("data\n"); + dumpMask(&data, 64); + DEBUG_PRINTF("data_lo\n"); + dumpMask(&data_lo, 64); + 
DEBUG_PRINTF("data_hi\n"); + dumpMask(&data_hi, 64); + DEBUG_PRINTF("hi_mask_1\n"); + dumpMask(&hi_mask_1, 64); + DEBUG_PRINTF("hi_mask_2\n"); + dumpMask(&hi_mask_2, 64); + DEBUG_PRINTF("lo_mask_1\n"); + dumpMask(&lo_mask_1, 64); + DEBUG_PRINTF("lo_mask_2\n"); + dumpMask(&lo_mask_2, 64); + DEBUG_PRINTF("c_lo_1\n"); + dumpMask(&c_lo_1, 64); + DEBUG_PRINTF("c_lo_2\n"); + dumpMask(&c_lo_2, 64); + DEBUG_PRINTF("c_hi_1\n"); + dumpMask(&c_hi_1, 64); + DEBUG_PRINTF("c_hi_2\n"); + dumpMask(&c_hi_2, 64); + DEBUG_PRINTF("result\n"); + dumpMask(&result, 64); + DEBUG_PRINTF("valid_data_mask %llx\n", valid_data_mask); +#endif + u64a cmp_result = (nresult ^ neg_mask) & valid_data_mask; + return !cmp_result; +} +#endif + +static really_inline +int checkMultipath32(u32 data, u32 hi_bits, u32 lo_bits) { + u32 t = ~(data | hi_bits); + t += lo_bits; + t &= (~data) & hi_bits; + DEBUG_PRINTF("t %x\n", t); + return !!t; +} + +static really_inline +int checkMultipath64(u64a data, u64a hi_bits, u64a lo_bits) { + u64a t = ~(data | hi_bits); + t += lo_bits; + t &= (~data) & hi_bits; + DEBUG_PRINTF("t %llx\n", t); + return !!t; +} + +static really_inline +int validateMultipathShuftiMask16x8(const m128 data, + const m256 nib_mask, + const m128 bucket_select_mask, + const u32 hi_bits, const u32 lo_bits, + const u32 neg_mask, + const u32 valid_path_mask) { + m256 data_256 = combine2x128(rshift64_m128(data, 4), data); + m256 low4bits = set32x8(0xf); + m256 c_nib = pshufb_m256(nib_mask, and256(data_256, low4bits)); + m128 t = and128(movdq_hi(c_nib), movdq_lo(c_nib)); + m128 result = and128(t, bucket_select_mask); + u32 nresult = movemask128(eq128(result, zeroes128())); + u32 cmp_result = (nresult ^ neg_mask) | valid_path_mask; + + DEBUG_PRINTF("cmp_result %x\n", cmp_result); + + return checkMultipath32(cmp_result, hi_bits, lo_bits); +} + +static really_inline +int validateMultipathShuftiMask32x8(const m256 data, + const m256 hi_mask, const m256 lo_mask, + const m256 bucket_select_mask, + const u32 hi_bits, const u32 lo_bits, + const u32 neg_mask, + const u32 valid_path_mask) { + m256 low4bits = set32x8(0xf); + m256 data_lo = and256(data, low4bits); + m256 data_hi = and256(rshift64_m256(data, 4), low4bits); + m256 c_lo = pshufb_m256(lo_mask, data_lo); + m256 c_hi = pshufb_m256(hi_mask, data_hi); + m256 c = and256(c_lo, c_hi); + m256 result = and256(c, bucket_select_mask); + u32 nresult = movemask256(eq256(result, zeroes256())); + u32 cmp_result = (nresult ^ neg_mask) | valid_path_mask; + + DEBUG_PRINTF("cmp_result %x\n", cmp_result); + + return checkMultipath32(cmp_result, hi_bits, lo_bits); +} + +static really_inline +int validateMultipathShuftiMask32x16(const m256 data, + const m256 hi_mask_1, const m256 hi_mask_2, + const m256 lo_mask_1, const m256 lo_mask_2, + const m256 bucket_select_mask_hi, + const m256 bucket_select_mask_lo, + const u32 hi_bits, const u32 lo_bits, + const u32 neg_mask, + const u32 valid_path_mask) { + m256 low4bits = set32x8(0xf); + m256 data_lo = and256(data, low4bits); + m256 data_hi = and256(rshift64_m256(data, 4), low4bits); + m256 c_lo_1 = pshufb_m256(lo_mask_1, data_lo); + m256 c_lo_2 = pshufb_m256(lo_mask_2, data_lo); + m256 c_hi_1 = pshufb_m256(hi_mask_1, data_hi); + m256 c_hi_2 = pshufb_m256(hi_mask_2, data_hi); + m256 t1 = and256(c_lo_1, c_hi_1); + m256 t2 = and256(c_lo_2, c_hi_2); + m256 result = or256(and256(t1, bucket_select_mask_lo), + and256(t2, bucket_select_mask_hi)); + u32 nresult = movemask256(eq256(result, zeroes256())); + u32 cmp_result = (nresult ^ neg_mask) | 
valid_path_mask; + + DEBUG_PRINTF("cmp_result %x\n", cmp_result); + + return checkMultipath32(cmp_result, hi_bits, lo_bits); +} + +static really_inline +int validateMultipathShuftiMask64(const m256 data_1, const m256 data_2, + const m256 hi_mask, const m256 lo_mask, + const m256 bucket_select_mask_1, + const m256 bucket_select_mask_2, + const u64a hi_bits, const u64a lo_bits, + const u64a neg_mask, + const u64a valid_path_mask) { + m256 low4bits = set32x8(0xf); + m256 c_lo_1 = pshufb_m256(lo_mask, and256(data_1, low4bits)); + m256 c_lo_2 = pshufb_m256(lo_mask, and256(data_2, low4bits)); + m256 c_hi_1 = pshufb_m256(hi_mask, + rshift64_m256(andnot256(low4bits, data_1), 4)); + m256 c_hi_2 = pshufb_m256(hi_mask, + rshift64_m256(andnot256(low4bits, data_2), 4)); + m256 t1 = and256(c_lo_1, c_hi_1); + m256 t2 = and256(c_lo_2, c_hi_2); + m256 nresult_1 = eq256(and256(t1, bucket_select_mask_1), zeroes256()); + m256 nresult_2 = eq256(and256(t2, bucket_select_mask_2), zeroes256()); + u64a nresult = (u64a)movemask256(nresult_1) | + (u64a)movemask256(nresult_2) << 32; + u64a cmp_result = (nresult ^ neg_mask) | valid_path_mask; + + DEBUG_PRINTF("cmp_result %llx\n", cmp_result); + + return checkMultipath64(cmp_result, hi_bits, lo_bits); +} + +#endif diff --git a/regex/runtime.c b/regex/runtime.c new file mode 100644 index 000000000..b7c17320e --- /dev/null +++ b/regex/runtime.c @@ -0,0 +1,1356 @@ +/* + * Copyright (c) 2015-2019, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Runtime functions. 
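+ *
+ * Public scan entry points for block, streaming and vectored modes
+ * (hs_scan(), hs_scan_stream(), hs_scan_vector()), stream lifetime and
+ * state management (open/copy/reset/compress/expand), and the TfwStr
+ * chunk-walking hs_scan_tfwstr() entry point.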
+ */ + +#ifndef __KERNEL__ +#include +#include +#else +#include +#include +#endif + +#include "allocator.h" +#include "hs_compile.h" /* for HS_MODE_* flags */ +#include "hs_runtime.h" +#include "hs_internal.h" +#include "hwlm/hwlm.h" +#include "nfa/mcclellan.h" +#include "nfa/nfa_api.h" +#include "nfa/nfa_api_util.h" +#include "nfa/nfa_internal.h" +#include "nfa/nfa_rev_api.h" +#include "nfa/sheng.h" +#include "smallwrite/smallwrite_internal.h" +#include "rose/rose.h" +#include "rose/runtime.h" +#include "database.h" +#include "report.h" +#include "scratch.h" +#include "som/som_runtime.h" +#include "som/som_stream.h" +#include "state.h" +#include "stream_compress.h" +#include "ue2common.h" +#include "util/exhaust.h" +#include "util/multibit.h" +#include "fw/str.h" + +static really_inline +void prefetch_data(const char *data, unsigned length) { + __builtin_prefetch(data); + __builtin_prefetch(data + length/2); + __builtin_prefetch(data + length - 24); +} + +/** dummy event handler for use when user does not provide one */ +static +int HS_CDECL null_onEvent(UNUSED unsigned id, UNUSED unsigned long long from, + UNUSED unsigned long long to, UNUSED unsigned flags, + UNUSED void *ctxt) { + return 0; +} + +static really_inline +u32 getHistoryAmount(const struct RoseEngine *t, u64a offset) { + return MIN(t->historyRequired, offset); +} + +static really_inline +u8 *getHistory(char *state, const struct RoseEngine *t, u64a offset) { + return (u8 *)state + t->stateOffsets.history + t->historyRequired + - MIN(t->historyRequired, offset); +} + +/** \brief Sanity checks for scratch space. + * + * Although more at home in scratch.c, it is located here to be closer to its + * callers. + */ +static really_inline +char validScratch(const struct RoseEngine *t, const struct hs_scratch *s) { + if (!ISALIGNED_CL(s)) { + DEBUG_PRINTF("bad alignment %p\n", s); + return 0; + } + + if (s->magic != SCRATCH_MAGIC) { + DEBUG_PRINTF("bad magic 0x%x\n", s->magic); + return 0; + } + + if (t->mode == HS_MODE_BLOCK && t->stateOffsets.end > s->bStateSize) { + DEBUG_PRINTF("bad state size\n"); + return 0; + } + + if (t->queueCount > s->queueCount) { + DEBUG_PRINTF("bad queue count\n"); + return 0; + } + + /* TODO: add quick rose sanity checks */ + + return 1; +} + +static really_inline +void populateCoreInfo(struct hs_scratch *s, const struct RoseEngine *rose, + char *state, match_event_handler onEvent, void *userCtx, + const char *data, size_t length, const u8 *history, + size_t hlen, u64a offset, u8 status, + UNUSED unsigned int flags) { + assert(rose); + s->core_info.userContext = userCtx; + s->core_info.userCallback = onEvent ? onEvent : null_onEvent; + s->core_info.rose = rose; + s->core_info.state = state; /* required for chained queues + evec */ + + s->core_info.exhaustionVector = state + rose->stateOffsets.exhausted; + s->core_info.status = status; + s->core_info.buf = (const u8 *)data; + s->core_info.len = length; + s->core_info.hbuf = history; + s->core_info.hlen = hlen; + s->core_info.buf_offset = offset; + + /* and some stuff not actually in core info */ + s->som_set_now_offset = ~0ULL; + s->deduper.current_report_offset = ~0ULL; + s->deduper.som_log_dirty = 1; /* som logs have not been cleared */ + s->fdr_conf = NULL; + + // Rose program execution (used for some report paths) depends on these + // values being initialised. 
+ s->tctxt.lastMatchOffset = 0; + s->tctxt.minMatchOffset = offset; + s->tctxt.minNonMpvMatchOffset = offset; +} + +#define STATUS_VALID_BITS \ + (STATUS_TERMINATED | STATUS_EXHAUSTED | STATUS_DELAY_DIRTY | STATUS_ERROR) + +/** \brief Retrieve status bitmask from stream state. */ +static really_inline +u8 getStreamStatus(const char *state) { + u8 status = *(const u8 *)(state + ROSE_STATE_OFFSET_STATUS_FLAGS); + assert((status & ~STATUS_VALID_BITS) == 0); + return status; +} + +/** \brief Store status bitmask to stream state. */ +static really_inline +void setStreamStatus(char *state, u8 status) { + assert((status & ~STATUS_VALID_BITS) == 0); + *(u8 *)(state + ROSE_STATE_OFFSET_STATUS_FLAGS) = status; +} + +/** \brief Initialise SOM state. Used in both block and streaming mode. */ +static really_inline +void initSomState(const struct RoseEngine *rose, char *state) { + assert(rose && state); + const u32 somCount = rose->somLocationCount; + mmbit_clear((u8 *)state + rose->stateOffsets.somValid, somCount); + mmbit_clear((u8 *)state + rose->stateOffsets.somWritable, somCount); +} + +static really_inline +void rawBlockExec(const struct RoseEngine *rose, struct hs_scratch *scratch) { + assert(rose); + assert(scratch); + + initSomState(rose, scratch->core_info.state); + + DEBUG_PRINTF("blockmode scan len=%zu\n", scratch->core_info.len); + + roseBlockExec(rose, scratch); +} + +static really_inline +void pureLiteralInitScratch(struct hs_scratch *scratch, u64a offset) { + // Some init has already been done. + assert(offset == scratch->core_info.buf_offset); + + scratch->tctxt.lit_offset_adjust = offset + 1; + scratch->tctxt.lastEndOffset = offset; + scratch->tctxt.delayLastEndOffset = offset; + scratch->tctxt.filledDelayedSlots = 0; + scratch->al_log_sum = 0; +} + + +static really_inline +void pureLiteralBlockExec(const struct RoseEngine *rose, + struct hs_scratch *scratch) { + assert(rose); + assert(scratch); + + const struct HWLM *ftable = getFLiteralMatcher(rose); + + initSomState(rose, scratch->core_info.state); + const u8 *buffer = scratch->core_info.buf; + size_t length = scratch->core_info.len; + DEBUG_PRINTF("rose engine %d\n", rose->runtimeImpl); + + pureLiteralInitScratch(scratch, 0); + scratch->tctxt.groups = rose->initialGroups; + + hwlmExec(ftable, buffer, length, 0, roseCallback, scratch, + rose->initialGroups & rose->floating_group_mask); +} + +static really_inline +void initOutfixQueue(struct mq *q, u32 qi, const struct RoseEngine *t, + struct hs_scratch *scratch) { + const struct NfaInfo *info = getNfaInfoByQueue(t, qi); + q->nfa = getNfaByInfo(t, info); + q->end = 0; + q->cur = 0; + q->state = scratch->fullState + info->fullStateOffset; + q->streamState = (char *)scratch->core_info.state + info->stateOffset; + q->offset = scratch->core_info.buf_offset; + q->buffer = scratch->core_info.buf; + q->length = scratch->core_info.len; + q->history = scratch->core_info.hbuf; + q->hlength = scratch->core_info.hlen; + q->cb = roseReportAdaptor; + q->context = scratch; + q->report_current = 0; + + DEBUG_PRINTF("qi=%u, offset=%llu, fullState=%u, streamState=%u, " + "state=%u\n", qi, q->offset, info->fullStateOffset, + info->stateOffset, *(u32 *)q->state); +} + +static never_inline +void soleOutfixBlockExec(const struct RoseEngine *t, + struct hs_scratch *scratch) { + assert(t); + assert(scratch); + + initSomState(t, scratch->core_info.state); + assert(t->outfixEndQueue == 1); + assert(!t->amatcherOffset); + assert(!t->ematcherOffset); + assert(!t->fmatcherOffset); + + const struct NFA *nfa = 
getNfaByQueue(t, 0); + + size_t len = nfaRevAccelCheck(nfa, scratch->core_info.buf, + scratch->core_info.len); + if (!len) { + return; + } + + struct mq *q = scratch->queues; + initOutfixQueue(q, 0, t, scratch); + q->length = len; /* adjust for rev_accel */ + nfaQueueInitState(nfa, q); + pushQueueAt(q, 0, MQE_START, 0); + pushQueueAt(q, 1, MQE_TOP, 0); + pushQueueAt(q, 2, MQE_END, scratch->core_info.len); + + char rv = nfaQueueExec(q->nfa, q, scratch->core_info.len); + + if (rv && nfaAcceptsEod(nfa) && len == scratch->core_info.len) { + nfaCheckFinalState(nfa, q->state, q->streamState, q->length, q->cb, + scratch); + } +} + +static rose_inline +void runSmallWriteEngine(const struct SmallWriteEngine *smwr, + struct hs_scratch *scratch) { + assert(smwr); + assert(scratch); + + const u8 *buffer = scratch->core_info.buf; + size_t length = scratch->core_info.len; + + DEBUG_PRINTF("USING SMALL WRITE\n"); + + if (length <= smwr->start_offset) { + DEBUG_PRINTF("too short\n"); + return; + } + + const struct NFA *nfa = getSmwrNfa(smwr); + + size_t local_alen = length - smwr->start_offset; + const u8 *local_buffer = buffer + smwr->start_offset; + + assert(isDfaType(nfa->type)); + if (nfa->type == MCCLELLAN_NFA_8) { + nfaExecMcClellan8_B(nfa, smwr->start_offset, local_buffer, + local_alen, roseReportAdaptor, scratch); + } else if (nfa->type == MCCLELLAN_NFA_16) { + nfaExecMcClellan16_B(nfa, smwr->start_offset, local_buffer, + local_alen, roseReportAdaptor, scratch); + } else { + nfaExecSheng_B(nfa, smwr->start_offset, local_buffer, + local_alen, roseReportAdaptor, scratch); + } +} + +HS_PUBLIC_API +hs_error_t HS_CDECL hs_scan(const hs_database_t *db, const char *data, + unsigned length, unsigned flags, + hs_scratch_t *scratch, match_event_handler onEvent, + void *userCtx) { + if (unlikely(!scratch || !data)) { + return HS_INVALID; + } + + hs_error_t err = validDatabase(db); + if (unlikely(err != HS_SUCCESS)) { + return err; + } + + const struct RoseEngine *rose = hs_get_bytecode(db); + if (unlikely(!ISALIGNED_16(rose))) { + return HS_INVALID; + } + + if (unlikely(rose->mode != HS_MODE_BLOCK)) { + return HS_DB_MODE_ERROR; + } + + if (unlikely(!validScratch(rose, scratch))) { + return HS_INVALID; + } + + if (unlikely(markScratchInUse(scratch))) { + return HS_SCRATCH_IN_USE; + } + + if (rose->minWidth > length) { + DEBUG_PRINTF("minwidth=%u > length=%u\n", rose->minWidth, length); + unmarkScratchInUse(scratch); + return HS_SUCCESS; + } + + prefetch_data(data, length); + + /* populate core info in scratch */ + populateCoreInfo(scratch, rose, scratch->bstate, onEvent, userCtx, data, + length, NULL, 0, 0, 0, flags); + + clearEvec(rose, scratch->core_info.exhaustionVector); + if (rose->ckeyCount) { + scratch->core_info.logicalVector = scratch->bstate + + rose->stateOffsets.logicalVec; + scratch->core_info.combVector = scratch->bstate + + rose->stateOffsets.combVec; + scratch->tctxt.lastCombMatchOffset = 0; + clearLvec(rose, scratch->core_info.logicalVector, + scratch->core_info.combVector); + } + + if (!length) { + if (rose->boundary.reportZeroEodOffset) { + roseRunBoundaryProgram(rose, rose->boundary.reportZeroEodOffset, 0, + scratch); + } + goto set_retval; + } + + if (rose->boundary.reportZeroOffset) { + int rv = roseRunBoundaryProgram(rose, rose->boundary.reportZeroOffset, + 0, scratch); + if (rv == MO_HALT_MATCHING) { + goto set_retval; + } + } + + if (rose->minWidthExcludingBoundaries > length) { + DEBUG_PRINTF("minWidthExcludingBoundaries=%u > length=%u\n", + rose->minWidthExcludingBoundaries, 
length); + goto done_scan; + } + + // Similarly, we may have a maximum width (for engines constructed entirely + // of bi-anchored patterns). + if (rose->maxBiAnchoredWidth != ROSE_BOUND_INF + && length > rose->maxBiAnchoredWidth) { + DEBUG_PRINTF("block len=%u longer than maxBAWidth=%u\n", length, + rose->maxBiAnchoredWidth); + goto done_scan; + } + + // Is this a small write case? + if (rose->smallWriteOffset) { + const struct SmallWriteEngine *smwr = getSmallWrite(rose); + assert(smwr); + + // Apply the small write engine if and only if the block (buffer) is + // small enough. Otherwise, we allow rose &co to deal with it. + if (length < smwr->largestBuffer) { + DEBUG_PRINTF("Attempting small write of block %u bytes long.\n", + length); + runSmallWriteEngine(smwr, scratch); + goto done_scan; + } + } + + switch (rose->runtimeImpl) { + case ROSE_RUNTIME_FULL_ROSE: + rawBlockExec(rose, scratch); + break; + case ROSE_RUNTIME_PURE_LITERAL: + pureLiteralBlockExec(rose, scratch); + break; + case ROSE_RUNTIME_SINGLE_OUTFIX: + soleOutfixBlockExec(rose, scratch); + break; + default: + assert(0); + } + +done_scan: + if (unlikely(internal_matching_error(scratch))) { + unmarkScratchInUse(scratch); + return HS_UNKNOWN_ERROR; + } else if (told_to_stop_matching(scratch)) { + unmarkScratchInUse(scratch); + return HS_SCAN_TERMINATED; + } + + if (rose->hasSom) { + int halt = flushStoredSomMatches(scratch, ~0ULL); + if (halt) { + unmarkScratchInUse(scratch); + return HS_SCAN_TERMINATED; + } + } + + if (rose->boundary.reportEodOffset) { + roseRunBoundaryProgram(rose, rose->boundary.reportEodOffset, length, + scratch); + } + +set_retval: + if (unlikely(internal_matching_error(scratch))) { + unmarkScratchInUse(scratch); + return HS_UNKNOWN_ERROR; + } + + if (rose->lastFlushCombProgramOffset) { + if (roseRunLastFlushCombProgram(rose, scratch, length) + == MO_HALT_MATCHING) { + if (unlikely(internal_matching_error(scratch))) { + unmarkScratchInUse(scratch); + return HS_UNKNOWN_ERROR; + } + unmarkScratchInUse(scratch); + return HS_SCAN_TERMINATED; + } + } + + DEBUG_PRINTF("done. told_to_stop_matching=%d\n", + told_to_stop_matching(scratch)); + hs_error_t rv = told_to_stop_matching(scratch) ? HS_SCAN_TERMINATED + : HS_SUCCESS; + unmarkScratchInUse(scratch); + return rv; +} + +static really_inline +void maintainHistoryBuffer(const struct RoseEngine *rose, char *state, + const char *buffer, size_t length) { + if (!rose->historyRequired) { + return; + } + + // Hopefully few of our users are scanning no data. + if (unlikely(length == 0)) { + DEBUG_PRINTF("zero-byte scan\n"); + return; + } + + char *his_state = state + rose->stateOffsets.history; + + if (length < rose->historyRequired) { + size_t shortfall = rose->historyRequired - length; + memmove(his_state, his_state + rose->historyRequired - shortfall, + shortfall); + } + size_t amount = MIN(rose->historyRequired, length); + + memcpy(his_state + rose->historyRequired - amount, buffer + length - amount, + amount); +#ifdef DEBUG_HISTORY + printf("History [%u] : ", rose->historyRequired); + for (size_t i = 0; i < rose->historyRequired; i++) { + printf(" %02hhx", his_state[i]); + } + printf("\n"); +#endif +} + +static really_inline +void init_stream(struct hs_stream *s, const struct RoseEngine *rose, + char init_history) { + char *state = getMultiState(s); + + if (init_history) { + // Make absolutely sure that the 16 bytes leading up to the end of the + // history buffer are initialised, as we rely on this (regardless of the + // actual values used) in FDR. 
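+        // The assert below guarantees that the 16-byte memset stays within
+        // the stream allocation even for tiny history requirements; the 0x5a
+        // fill value itself is arbitrary.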
+ char *hist_end = + state + rose->stateOffsets.history + rose->historyRequired; + assert(hist_end - 16 >= (const char *)s); + memset(hist_end - 16, 0x5a, 16); + } + + s->rose = rose; + s->offset = 0; + + setStreamStatus(state, 0); + roseInitState(rose, state); + + clearEvec(rose, state + rose->stateOffsets.exhausted); + if (rose->ckeyCount) { + clearLvec(rose, state + rose->stateOffsets.logicalVec, + state + rose->stateOffsets.combVec); + } + + // SOM state multibit structures. + initSomState(rose, state); +} + +HS_PUBLIC_API +hs_error_t HS_CDECL hs_open_stream(const hs_database_t *db, + UNUSED unsigned flags, + hs_stream_t **stream) { + if (unlikely(!stream)) { + return HS_INVALID; + } + + *stream = NULL; + + hs_error_t err = validDatabase(db); + if (unlikely(err != HS_SUCCESS)) { + return err; + } + + const struct RoseEngine *rose = hs_get_bytecode(db); + if (unlikely(!ISALIGNED_16(rose))) { + return HS_INVALID; + } + + if (unlikely(rose->mode != HS_MODE_STREAM)) { + return HS_DB_MODE_ERROR; + } + + size_t stateSize = rose->stateOffsets.end; + struct hs_stream *s = hs_stream_alloc(sizeof(struct hs_stream) + stateSize); + if (unlikely(!s)) { + return HS_NOMEM; + } + + init_stream(s, rose, 1); + + *stream = s; + return HS_SUCCESS; +} + + +static really_inline +void rawEodExec(hs_stream_t *id, hs_scratch_t *scratch) { + const struct RoseEngine *rose = id->rose; + + if (can_stop_matching(scratch)) { + DEBUG_PRINTF("stream already broken\n"); + return; + } + + if (isAllExhausted(rose, scratch->core_info.exhaustionVector)) { + DEBUG_PRINTF("stream exhausted\n"); + return; + } + + roseStreamEodExec(rose, id->offset, scratch); +} + +static never_inline +void soleOutfixEodExec(hs_stream_t *id, hs_scratch_t *scratch) { + const struct RoseEngine *t = id->rose; + + if (can_stop_matching(scratch)) { + DEBUG_PRINTF("stream already broken\n"); + return; + } + + if (isAllExhausted(t, scratch->core_info.exhaustionVector)) { + DEBUG_PRINTF("stream exhausted\n"); + return; + } + + assert(t->outfixEndQueue == 1); + assert(!t->amatcherOffset); + assert(!t->ematcherOffset); + assert(!t->fmatcherOffset); + + const struct NFA *nfa = getNfaByQueue(t, 0); + + struct mq *q = scratch->queues; + initOutfixQueue(q, 0, t, scratch); + if (!scratch->core_info.buf_offset) { + DEBUG_PRINTF("buf_offset is zero\n"); + return; /* no vacuous engines */ + } + + nfaExpandState(nfa, q->state, q->streamState, q->offset, + queue_prev_byte(q, 0)); + + assert(nfaAcceptsEod(nfa)); + nfaCheckFinalState(nfa, q->state, q->streamState, q->offset, q->cb, + scratch); +} + +static really_inline +void report_eod_matches(hs_stream_t *id, hs_scratch_t *scratch, + match_event_handler onEvent, void *context) { + DEBUG_PRINTF("--- report eod matches at offset %llu\n", id->offset); + assert(onEvent); + + const struct RoseEngine *rose = id->rose; + char *state = getMultiState(id); + u8 status = getStreamStatus(state); + + if (status & (STATUS_TERMINATED | STATUS_EXHAUSTED | STATUS_ERROR)) { + DEBUG_PRINTF("stream is broken, just freeing storage\n"); + return; + } + + populateCoreInfo(scratch, rose, state, onEvent, context, NULL, 0, + getHistory(state, rose, id->offset), + getHistoryAmount(rose, id->offset), id->offset, status, 0); + + if (rose->ckeyCount) { + scratch->core_info.logicalVector = state + + rose->stateOffsets.logicalVec; + scratch->core_info.combVector = state + rose->stateOffsets.combVec; + if (!id->offset) { + scratch->tctxt.lastCombMatchOffset = id->offset; + } + } + + if (rose->somLocationCount) { + loadSomFromStream(scratch, 
id->offset); + } + + if (!id->offset) { + if (rose->boundary.reportZeroEodOffset) { + int rv = roseRunBoundaryProgram( + rose, rose->boundary.reportZeroEodOffset, 0, scratch); + if (rv == MO_HALT_MATCHING) { + return; + } + } + } else { + if (rose->boundary.reportEodOffset) { + int rv = roseRunBoundaryProgram( + rose, rose->boundary.reportEodOffset, id->offset, scratch); + if (rv == MO_HALT_MATCHING) { + return; + } + } + + if (rose->requiresEodCheck) { + switch (rose->runtimeImpl) { + default: + case ROSE_RUNTIME_FULL_ROSE: + rawEodExec(id, scratch); + break; + case ROSE_RUNTIME_SINGLE_OUTFIX: + soleOutfixEodExec(id, scratch); + break; + case ROSE_RUNTIME_PURE_LITERAL: + assert(0); + } + } + } + + if (rose->hasSom && !told_to_stop_matching(scratch)) { + int halt = flushStoredSomMatches(scratch, ~0ULL); + if (halt) { + DEBUG_PRINTF("told to stop matching\n"); + scratch->core_info.status |= STATUS_TERMINATED; + } + } + + if (rose->lastFlushCombProgramOffset && !told_to_stop_matching(scratch)) { + if (roseRunLastFlushCombProgram(rose, scratch, id->offset) + == MO_HALT_MATCHING) { + DEBUG_PRINTF("told to stop matching\n"); + scratch->core_info.status |= STATUS_TERMINATED; + } + } +} + +HS_PUBLIC_API +hs_error_t HS_CDECL hs_copy_stream(hs_stream_t **to_id, + const hs_stream_t *from_id) { + if (!to_id) { + return HS_INVALID; + } + + *to_id = NULL; + + if (!from_id || !from_id->rose) { + return HS_INVALID; + } + + const struct RoseEngine *rose = from_id->rose; + size_t stateSize = sizeof(struct hs_stream) + rose->stateOffsets.end; + + struct hs_stream *s = hs_stream_alloc(stateSize); + if (!s) { + return HS_NOMEM; + } + + memcpy(s, from_id, stateSize); + + *to_id = s; + + return HS_SUCCESS; +} + +HS_PUBLIC_API +hs_error_t HS_CDECL hs_reset_and_copy_stream(hs_stream_t *to_id, + const hs_stream_t *from_id, + hs_scratch_t *scratch, + match_event_handler onEvent, + void *context) { + if (!from_id || !from_id->rose) { + return HS_INVALID; + } + + if (!to_id || to_id->rose != from_id->rose) { + return HS_INVALID; + } + + if (to_id == from_id) { + return HS_INVALID; + } + + if (onEvent) { + if (!scratch || !validScratch(to_id->rose, scratch)) { + return HS_INVALID; + } + if (unlikely(markScratchInUse(scratch))) { + return HS_SCRATCH_IN_USE; + } + report_eod_matches(to_id, scratch, onEvent, context); + if (unlikely(internal_matching_error(scratch))) { + unmarkScratchInUse(scratch); + return HS_UNKNOWN_ERROR; + } + unmarkScratchInUse(scratch); + } + + size_t stateSize + = sizeof(struct hs_stream) + from_id->rose->stateOffsets.end; + + memcpy(to_id, from_id, stateSize); + + return HS_SUCCESS; +} + +static really_inline +void rawStreamExec(struct hs_stream *stream_state, struct hs_scratch *scratch) { + assert(stream_state); + assert(scratch); + assert(!can_stop_matching(scratch)); + + DEBUG_PRINTF("::: streaming rose ::: offset = %llu len = %zu\n", + stream_state->offset, scratch->core_info.len); + + const struct RoseEngine *rose = stream_state->rose; + assert(rose); + roseStreamExec(rose, scratch); + + if (!told_to_stop_matching(scratch) && + isAllExhausted(rose, scratch->core_info.exhaustionVector)) { + DEBUG_PRINTF("stream exhausted\n"); + scratch->core_info.status |= STATUS_EXHAUSTED; + } +} + +static really_inline +void pureLiteralStreamExec(struct hs_stream *stream_state, + struct hs_scratch *scratch) { + assert(stream_state); + assert(scratch); + assert(!can_stop_matching(scratch)); + + const struct RoseEngine *rose = stream_state->rose; + const struct HWLM *ftable = getFLiteralMatcher(rose); + + 
size_t len2 = scratch->core_info.len; + + DEBUG_PRINTF("::: streaming rose ::: offset = %llu len = %zu\n", + stream_state->offset, scratch->core_info.len); + + pureLiteralInitScratch(scratch, stream_state->offset); + scratch->tctxt.groups = loadGroups(rose, scratch->core_info.state); + + // Pure literal cases don't have floatingMinDistance set, so we always + // start the match region at zero. + const size_t start = 0; + + hwlmExecStreaming(ftable, len2, start, roseCallback, scratch, + rose->initialGroups & rose->floating_group_mask); + + if (!told_to_stop_matching(scratch) && + isAllExhausted(rose, scratch->core_info.exhaustionVector)) { + DEBUG_PRINTF("stream exhausted\n"); + scratch->core_info.status |= STATUS_EXHAUSTED; + } +} + +static never_inline +void soleOutfixStreamExec(struct hs_stream *stream_state, + struct hs_scratch *scratch) { + assert(stream_state); + assert(scratch); + assert(!can_stop_matching(scratch)); + + const struct RoseEngine *t = stream_state->rose; + assert(t->outfixEndQueue == 1); + assert(!t->amatcherOffset); + assert(!t->ematcherOffset); + assert(!t->fmatcherOffset); + + const struct NFA *nfa = getNfaByQueue(t, 0); + + struct mq *q = scratch->queues; + initOutfixQueue(q, 0, t, scratch); + if (!scratch->core_info.buf_offset) { + nfaQueueInitState(nfa, q); + pushQueueAt(q, 0, MQE_START, 0); + pushQueueAt(q, 1, MQE_TOP, 0); + pushQueueAt(q, 2, MQE_END, scratch->core_info.len); + } else { + nfaExpandState(nfa, q->state, q->streamState, q->offset, + queue_prev_byte(q, 0)); + pushQueueAt(q, 0, MQE_START, 0); + pushQueueAt(q, 1, MQE_END, scratch->core_info.len); + } + + if (nfaQueueExec(q->nfa, q, scratch->core_info.len)) { + nfaQueueCompressState(nfa, q, scratch->core_info.len); + } else if (!told_to_stop_matching(scratch)) { + scratch->core_info.status |= STATUS_EXHAUSTED; + } +} + +static inline +hs_error_t hs_scan_stream_internal(hs_stream_t *id, const char *data, + unsigned length, UNUSED unsigned flags, + hs_scratch_t *scratch, + match_event_handler onEvent, void *context) { + assert(id); + assert(scratch); + + if (unlikely(!data)) { + return HS_INVALID; + } + + const struct RoseEngine *rose = id->rose; + char *state = getMultiState(id); + + u8 status = getStreamStatus(state); + if (status & (STATUS_TERMINATED | STATUS_EXHAUSTED | STATUS_ERROR)) { + DEBUG_PRINTF("stream is broken, halting scan\n"); + if (status & STATUS_ERROR) { + return HS_UNKNOWN_ERROR; + } else if (status & STATUS_TERMINATED) { + return HS_SCAN_TERMINATED; + } else { + return HS_SUCCESS; + } + } + + // We avoid doing any work if the user has given us zero bytes of data to + // scan. Arguably we should define some semantics for how we treat vacuous + // cases here. 
+ if (unlikely(length == 0)) { + DEBUG_PRINTF("zero length block\n"); + return HS_SUCCESS; + } + + u32 historyAmount = getHistoryAmount(rose, id->offset); + populateCoreInfo(scratch, rose, state, onEvent, context, data, length, + getHistory(state, rose, id->offset), historyAmount, + id->offset, status, flags); + if (rose->ckeyCount) { + scratch->core_info.logicalVector = state + + rose->stateOffsets.logicalVec; + scratch->core_info.combVector = state + rose->stateOffsets.combVec; + if (!id->offset) { + scratch->tctxt.lastCombMatchOffset = id->offset; + } + } + assert(scratch->core_info.hlen <= id->offset + && scratch->core_info.hlen <= rose->historyRequired); + + prefetch_data(data, length); + + if (rose->somLocationCount) { + loadSomFromStream(scratch, id->offset); + } + + if (!id->offset && rose->boundary.reportZeroOffset) { + DEBUG_PRINTF("zero reports\n"); + int rv = roseRunBoundaryProgram(rose, rose->boundary.reportZeroOffset, + 0, scratch); + if (rv == MO_HALT_MATCHING) { + DEBUG_PRINTF("halting scan\n"); + setStreamStatus(state, scratch->core_info.status); + if (told_to_stop_matching(scratch)) { + return HS_SCAN_TERMINATED; + } else { + assert(scratch->core_info.status & STATUS_EXHAUSTED); + return HS_SUCCESS; + } + } + } + + switch (rose->runtimeImpl) { + case ROSE_RUNTIME_FULL_ROSE: + rawStreamExec(id, scratch); + break; + case ROSE_RUNTIME_PURE_LITERAL: + pureLiteralStreamExec(id, scratch); + break; + case ROSE_RUNTIME_SINGLE_OUTFIX: + soleOutfixStreamExec(id, scratch); + break; + default: + assert(0); + } + + if (rose->hasSom && !told_to_stop_matching(scratch)) { + int halt = flushStoredSomMatches(scratch, ~0ULL); + if (halt) { + scratch->core_info.status |= STATUS_TERMINATED; + } + } + + setStreamStatus(state, scratch->core_info.status); + + if (unlikely(internal_matching_error(scratch))) { + return HS_UNKNOWN_ERROR; + } else if (likely(!can_stop_matching(scratch))) { + maintainHistoryBuffer(rose, state, data, length); + id->offset += length; /* maintain offset */ + + if (rose->somLocationCount) { + storeSomToStream(scratch, id->offset); + } + } else if (told_to_stop_matching(scratch)) { + return HS_SCAN_TERMINATED; + } + + return HS_SUCCESS; +} + +HS_PUBLIC_API +hs_error_t HS_CDECL hs_scan_stream(hs_stream_t *id, const char *data, + unsigned length, unsigned flags, + hs_scratch_t *scratch, + match_event_handler onEvent, void *context) { + if (unlikely(!id || !scratch || !data || + !validScratch(id->rose, scratch))) { + return HS_INVALID; + } + + if (unlikely(markScratchInUse(scratch))) { + return HS_SCRATCH_IN_USE; + } + hs_error_t rv = hs_scan_stream_internal(id, data, length, flags, scratch, + onEvent, context); + unmarkScratchInUse(scratch); + return rv; +} + +HS_PUBLIC_API +hs_error_t HS_CDECL hs_close_stream(hs_stream_t *id, hs_scratch_t *scratch, + match_event_handler onEvent, + void *context) { + if (!id) { + return HS_INVALID; + } + + if (onEvent) { + if (!scratch || !validScratch(id->rose, scratch)) { + return HS_INVALID; + } + if (unlikely(markScratchInUse(scratch))) { + return HS_SCRATCH_IN_USE; + } + report_eod_matches(id, scratch, onEvent, context); + if (unlikely(internal_matching_error(scratch))) { + unmarkScratchInUse(scratch); + return HS_UNKNOWN_ERROR; + } + unmarkScratchInUse(scratch); + } + + hs_stream_free(id); + + return HS_SUCCESS; +} + +HS_PUBLIC_API +hs_error_t HS_CDECL hs_reset_stream(hs_stream_t *id, UNUSED unsigned int flags, + hs_scratch_t *scratch, + match_event_handler onEvent, + void *context) { + if (!id) { + return HS_INVALID; + } + + if 
(onEvent) { + if (!scratch || !validScratch(id->rose, scratch)) { + return HS_INVALID; + } + if (unlikely(markScratchInUse(scratch))) { + return HS_SCRATCH_IN_USE; + } + report_eod_matches(id, scratch, onEvent, context); + if (unlikely(internal_matching_error(scratch))) { + unmarkScratchInUse(scratch); + return HS_UNKNOWN_ERROR; + } + unmarkScratchInUse(scratch); + } + + // history already initialised + init_stream(id, id->rose, 0); + + return HS_SUCCESS; +} + +HS_PUBLIC_API +hs_error_t HS_CDECL hs_stream_size(const hs_database_t *db, + size_t *stream_size) { + if (!stream_size) { + return HS_INVALID; + } + + hs_error_t ret = validDatabase(db); + if (ret != HS_SUCCESS) { + return ret; + } + + const struct RoseEngine *rose = hs_get_bytecode(db); + if (!ISALIGNED_16(rose)) { + return HS_INVALID; + } + + if (rose->mode != HS_MODE_STREAM) { + return HS_DB_MODE_ERROR; + } + + u32 base_stream_size = rose->stateOffsets.end; + + // stream state plus the hs_stream struct itself + *stream_size = base_stream_size + sizeof(struct hs_stream); + + return HS_SUCCESS; +} + +#if defined(DEBUG) || defined(DUMP_SUPPORT) +#include "util/compare.h" +// A debugging crutch: print a hex-escaped version of the match for our +// perusal. +static UNUSED +void dumpData(const char *data, size_t len) { + DEBUG_PRINTF("BUFFER:"); + for (size_t i = 0; i < len; i++) { + u8 c = data[i]; + if (ourisprint(c) && c != '\'') { + DEBUG_PRINTF("%c", c); + } else { + DEBUG_PRINTF("\\x%02x", c); + } + } + DEBUG_PRINTF("\n"); +} +#endif + +HS_PUBLIC_API +hs_error_t HS_CDECL hs_scan_vector(const hs_database_t *db, + const char * const * data, + const unsigned int *length, + unsigned int count, + UNUSED unsigned int flags, + hs_scratch_t *scratch, + match_event_handler onEvent, void *context) { + if (unlikely(!scratch || !data || !length)) { + return HS_INVALID; + } + + hs_error_t err = validDatabase(db); + if (unlikely(err != HS_SUCCESS)) { + return err; + } + + const struct RoseEngine *rose = hs_get_bytecode(db); + if (unlikely(!ISALIGNED_16(rose))) { + return HS_INVALID; + } + + if (unlikely(rose->mode != HS_MODE_VECTORED)) { + return HS_DB_MODE_ERROR; + } + + if (unlikely(!validScratch(rose, scratch))) { + return HS_INVALID; + } + + if (unlikely(markScratchInUse(scratch))) { + return HS_SCRATCH_IN_USE; + } + + hs_stream_t *id = (hs_stream_t *)(scratch->bstate); + + init_stream(id, rose, 1); /* open stream */ + + for (u32 i = 0; i < count; i++) { + DEBUG_PRINTF("block %u/%u offset=%llu len=%u\n", i, count, id->offset, + length[i]); +#ifdef DEBUG + dumpData(data[i], length[i]); +#endif + hs_error_t ret + = hs_scan_stream_internal(id, data[i], length[i], 0, scratch, + onEvent, context); + if (ret != HS_SUCCESS) { + unmarkScratchInUse(scratch); + return ret; + } + } + + /* close stream */ + if (onEvent) { + report_eod_matches(id, scratch, onEvent, context); + + if (unlikely(internal_matching_error(scratch))) { + unmarkScratchInUse(scratch); + return HS_UNKNOWN_ERROR; + } else if (told_to_stop_matching(scratch)) { + unmarkScratchInUse(scratch); + return HS_SCAN_TERMINATED; + } + } + + unmarkScratchInUse(scratch); + + return HS_SUCCESS; +} + +HS_PUBLIC_API +hs_error_t HS_CDECL hs_scan_tfwstr(const hs_database_t *db, + const void *data,/*TfwStr*/ + UNUSED unsigned int flags, + hs_scratch_t *scratch, + match_event_handler onEvent, void *context) { + + const TfwStr *chunk, *end, *str; + + if (unlikely(!scratch || !data )) { + return HS_INVALID; + } + + str = (const TfwStr *)data; + + hs_error_t err = validDatabase(db); + if (unlikely(err 
!= HS_SUCCESS)) { + return err; + } + + const struct RoseEngine *rose = hs_get_bytecode(db); + if (unlikely(!ISALIGNED_16(rose))) { + return HS_INVALID; + } + + if (unlikely(rose->mode != HS_MODE_VECTORED)) { + return HS_DB_MODE_ERROR; + } + + if (unlikely(!validScratch(rose, scratch))) { + return HS_INVALID; + } + + if (unlikely(markScratchInUse(scratch))) { + return HS_SCRATCH_IN_USE; + } + + hs_stream_t *id = (hs_stream_t *)(scratch->bstate); + + init_stream(id, rose, 1); /* open stream */ + + + TFW_STR_FOR_EACH_CHUNK(chunk, str, end) { + DEBUG_PRINTF("offset=%llu len=%lu\n", id->offset, chunk->len); +#ifdef DEBUG + dumpData(chunk->data, chunk->len); +#endif + hs_error_t ret + = hs_scan_stream_internal(id, chunk->data, chunk->len, 0, scratch, + onEvent, context); + if (ret != HS_SUCCESS) { + unmarkScratchInUse(scratch); + return ret; + } + } + + /* close stream */ + if (onEvent) { + report_eod_matches(id, scratch, onEvent, context); + + if (unlikely(internal_matching_error(scratch))) { + unmarkScratchInUse(scratch); + return HS_UNKNOWN_ERROR; + } else if (told_to_stop_matching(scratch)) { + unmarkScratchInUse(scratch); + return HS_SCAN_TERMINATED; + } + } + + unmarkScratchInUse(scratch); + return HS_SUCCESS; +} + +HS_PUBLIC_API +hs_error_t HS_CDECL hs_compress_stream(const hs_stream_t *stream, char *buf, + size_t buf_space, size_t *used_space) { + if (unlikely(!stream || !used_space)) { + return HS_INVALID; + } + + if (unlikely(buf_space && !buf)) { + return HS_INVALID; + } + + const struct RoseEngine *rose = stream->rose; + + size_t stream_size = size_compress_stream(rose, stream); + + DEBUG_PRINTF("require %zu [orig %zu]\n", stream_size, + rose->stateOffsets.end + sizeof(struct hs_stream)); + *used_space = stream_size; + + if (buf_space < stream_size) { + return HS_INSUFFICIENT_SPACE; + } + compress_stream(buf, stream_size, rose, stream); + + return HS_SUCCESS; +} + +HS_PUBLIC_API +hs_error_t HS_CDECL hs_expand_stream(const hs_database_t *db, + hs_stream_t **stream, + const char *buf, size_t buf_size) { + if (unlikely(!stream || !buf)) { + return HS_INVALID; + } + + *stream = NULL; + + hs_error_t err = validDatabase(db); + if (unlikely(err != HS_SUCCESS)) { + return err; + } + + const struct RoseEngine *rose = hs_get_bytecode(db); + if (unlikely(!ISALIGNED_16(rose))) { + return HS_INVALID; + } + + if (unlikely(rose->mode != HS_MODE_STREAM)) { + return HS_DB_MODE_ERROR; + } + + size_t stream_size = rose->stateOffsets.end + sizeof(struct hs_stream); + + struct hs_stream *s = hs_stream_alloc(stream_size); + if (unlikely(!s)) { + return HS_NOMEM; + } + + if (!expand_stream(s, rose, buf, buf_size)) { + hs_stream_free(s); + return HS_INVALID; + } + + *stream = s; + return HS_SUCCESS; +} + +HS_PUBLIC_API +hs_error_t HS_CDECL hs_reset_and_expand_stream(hs_stream_t *to_stream, + const char *buf, size_t buf_size, + hs_scratch_t *scratch, + match_event_handler onEvent, + void *context) { + if (unlikely(!to_stream || !buf)) { + return HS_INVALID; + } + + const struct RoseEngine *rose = to_stream->rose; + + if (onEvent) { + if (!scratch || !validScratch(to_stream->rose, scratch)) { + return HS_INVALID; + } + if (unlikely(markScratchInUse(scratch))) { + return HS_SCRATCH_IN_USE; + } + report_eod_matches(to_stream, scratch, onEvent, context); + if (unlikely(internal_matching_error(scratch))) { + unmarkScratchInUse(scratch); + return HS_UNKNOWN_ERROR; + } + unmarkScratchInUse(scratch); + } + + if (expand_stream(to_stream, rose, buf, buf_size)) { + return HS_SUCCESS; + } else { + return HS_INVALID; + 
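    /*
     * For reference, the compress/expand pair above forms a round trip on
     * the application side. Illustrative sketch only, assuming an existing
     * `db` and a live `stream`:
     *
     *     size_t used = 0;
     *     hs_stream_t *copy = NULL;
     *
     *     // Sizing call: with no buffer supplied, the requirement is
     *     // written to `used` and HS_INSUFFICIENT_SPACE is returned.
     *     if (hs_compress_stream(stream, NULL, 0, &used)
     *         == HS_INSUFFICIENT_SPACE) {
     *         char *buf = malloc(used);
     *         if (buf
     *             && hs_compress_stream(stream, buf, used, &used) == HS_SUCCESS
     *             && hs_expand_stream(db, &copy, buf, used) == HS_SUCCESS) {
     *             // `copy` now carries the same stream state as `stream`.
     *         }
     *         free(buf);
     *     }
     */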
} +} diff --git a/regex/scratch.c b/regex/scratch.c new file mode 100644 index 000000000..1e620fe73 --- /dev/null +++ b/regex/scratch.c @@ -0,0 +1,466 @@ +/* + * Copyright (c) 2015-2019, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Functions for allocating and manipulating scratch space. + */ + +#ifndef __KERNEL__ +#include +#include +#else +#include +#include +#endif + +#include "allocator.h" +#include "hs_internal.h" +#include "hs_runtime.h" +#include "scratch.h" +#include "state.h" +#include "ue2common.h" +#include "database.h" +#include "nfa/nfa_api_queue.h" +#include "rose/rose_internal.h" +#include "util/fatbit.h" + +/** + * Determine the space required for a correctly aligned array of fatbit + * structure, laid out as: + * + * - an array of num_entries pointers, each to a fatbit. + * - an array of fatbit structures, each of size fatbit_len. + * + * fatbit_len should have been determined at compile time, via the + * fatbit_size() call. + */ +static +size_t fatbit_array_size(u32 num_entries, u32 fatbit_len) { + size_t len = 0; + + // Array of pointers to each fatbit entry. + len += sizeof(struct fatbit *) * num_entries; + + // Fatbit entries themselves. + len = ROUNDUP_N(len, alignof(struct fatbit)); + len += (size_t)fatbit_len * num_entries; + + return ROUNDUP_N(len, 8); // Round up for potential padding. +} + +/** Used by hs_alloc_scratch and hs_clone_scratch to allocate a complete + * scratch region from a prototype structure. 
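 *
 * The prototype carries the largest requirements seen so far (collected in
 * hs_alloc_scratch() below); alloc_scratch() then carves every sub-array out
 * of one hs_scratch_alloc() block, roughly laid out as
 *
 *     [hs_scratch][mq queues][som stores][fatbit ptr arrays + fatbits]
 *     [catchup pq][bstate][tstate][deduper logs][aqa][fullState]
 *
 * with 8-byte or cacheline alignment padding between regions.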
*/ +static +hs_error_t alloc_scratch(const hs_scratch_t *proto, hs_scratch_t **scratch) { + u32 queueCount = proto->queueCount; + u32 activeQueueArraySize = proto->activeQueueArraySize; + u32 deduperCount = proto->deduper.dkey_count; + u32 deduperLogSize = proto->deduper.log_size; + u32 bStateSize = proto->bStateSize; + u32 tStateSize = proto->tStateSize; + u32 fullStateSize = proto->fullStateSize; + u32 anchored_literal_region_len = proto->anchored_literal_region_len; + u32 anchored_literal_fatbit_size = proto->anchored_literal_fatbit_size; + + u32 som_store_size = proto->som_store_count * sizeof(u64a); + u32 som_attempted_store_size = proto->som_store_count * sizeof(u64a); + u32 som_now_size = proto->som_fatbit_size; + u32 som_attempted_size = proto->som_fatbit_size; + + struct hs_scratch *s; + struct hs_scratch *s_tmp; + size_t queue_size = queueCount * sizeof(struct mq); + size_t qmpq_size = queueCount * sizeof(struct queue_match); + + assert(anchored_literal_region_len < 8 * sizeof(s->al_log_sum)); + + size_t anchored_literal_region_size = fatbit_array_size( + anchored_literal_region_len, proto->anchored_literal_fatbit_size); + size_t delay_region_size = + fatbit_array_size(DELAY_SLOT_COUNT, proto->delay_fatbit_size); + + // the size is all the allocated stuff, not including the struct itself + size_t size = queue_size + 63 + + bStateSize + tStateSize + + fullStateSize + 63 /* cacheline padding */ + + proto->handledKeyFatbitSize /* handled roles */ + + activeQueueArraySize /* active queue array */ + + 2 * deduperLogSize /* need odd and even logs */ + + 2 * deduperLogSize /* ditto som logs */ + + 2 * sizeof(u64a) * deduperCount /* start offsets for som */ + + anchored_literal_region_size + qmpq_size + + delay_region_size + + som_store_size + + som_now_size + + som_attempted_size + + som_attempted_store_size + 15; + + /* the struct plus the allocated stuff plus padding for cacheline + * alignment */ + size_t alloc_size = sizeof(struct hs_scratch) + size + 256; + if (!*scratch) { + s_tmp = hs_scratch_alloc(alloc_size); + hs_error_t err = hs_check_alloc(s_tmp); + if (err != HS_SUCCESS) { + hs_scratch_free(s_tmp); + return err; + } + + memset(s_tmp, 0, alloc_size); + s = ROUNDUP_PTR(s_tmp, 64); + DEBUG_PRINTF("allocated %zu bytes at %p but realigning to %p\n", alloc_size, s_tmp, s); + DEBUG_PRINTF("sizeof %zu\n", sizeof(struct hs_scratch)); + } else { + s = *scratch; + assert(proto->scratchSize == alloc_size); + s_tmp = (hs_scratch_t *)s->scratch_alloc; + } + + *s = *proto; + + s->magic = SCRATCH_MAGIC; + s->in_use = 0; + s->scratchSize = alloc_size; + s->scratch_alloc = (char *)s_tmp; + s->fdr_conf = NULL; + + // each of these is at an offset from the previous + char *current = (char *)s + sizeof(*s); + + // align current so that the following arrays are naturally aligned: this + // is accounted for in the padding allocated + current = ROUNDUP_PTR(current, 8); + + s->queues = (struct mq *)current; + current += queue_size; + + assert(ISALIGNED_N(current, 8)); + s->som_store = (u64a *)current; + current += som_store_size; + + s->som_attempted_store = (u64a *)current; + current += som_attempted_store_size; + + current = ROUNDUP_PTR(current, alignof(struct fatbit *)); + s->delay_slots = (struct fatbit **)current; + current += sizeof(struct fatbit *) * DELAY_SLOT_COUNT; + current = ROUNDUP_PTR(current, alignof(struct fatbit)); + for (u32 i = 0; i < DELAY_SLOT_COUNT; i++) { + s->delay_slots[i] = (struct fatbit *)current; + assert(ISALIGNED(s->delay_slots[i])); + current += 
proto->delay_fatbit_size; + } + + current = ROUNDUP_PTR(current, alignof(struct fatbit *)); + s->al_log = (struct fatbit **)current; + current += sizeof(struct fatbit *) * anchored_literal_region_len; + current = ROUNDUP_PTR(current, alignof(struct fatbit)); + for (u32 i = 0; i < anchored_literal_region_len; i++) { + s->al_log[i] = (struct fatbit *)current; + assert(ISALIGNED(s->al_log[i])); + current += anchored_literal_fatbit_size; + } + + current = ROUNDUP_PTR(current, 8); + s->catchup_pq.qm = (struct queue_match *)current; + current += qmpq_size; + + s->bstate = (char *)current; + s->bStateSize = bStateSize; + current += bStateSize; + + s->tstate = (char *)current; + s->tStateSize = tStateSize; + current += tStateSize; + + current = ROUNDUP_PTR(current, 64); + + assert(ISALIGNED_N(current, 8)); + s->deduper.som_start_log[0] = (u64a *)current; + current += sizeof(u64a) * deduperCount; + + s->deduper.som_start_log[1] = (u64a *)current; + current += sizeof(u64a) * deduperCount; + + assert(ISALIGNED_N(current, 8)); + s->aqa = (struct fatbit *)current; + current += activeQueueArraySize; + + s->handled_roles = (struct fatbit *)current; + current += proto->handledKeyFatbitSize; + + s->deduper.log[0] = (struct fatbit *)current; + current += deduperLogSize; + + s->deduper.log[1] = (struct fatbit *)current; + current += deduperLogSize; + + s->deduper.som_log[0] = (struct fatbit *)current; + current += deduperLogSize; + + s->deduper.som_log[1] = (struct fatbit *)current; + current += deduperLogSize; + + s->som_set_now = (struct fatbit *)current; + current += som_now_size; + + s->som_attempted_set = (struct fatbit *)current; + current += som_attempted_size; + + current = ROUNDUP_PTR(current, 64); + assert(ISALIGNED_CL(current)); + s->fullState = (char *)current; + s->fullStateSize = fullStateSize; + current += fullStateSize; + + *scratch = s; + + // Don't get too big for your boots + assert((size_t)(current - (char *)s) <= alloc_size); + + // Init q->scratch ptr for every queue. + for (struct mq *qi = s->queues; qi != s->queues + queueCount; ++qi) { + qi->scratch = s; + } + + return HS_SUCCESS; +} + +HS_PUBLIC_API +hs_error_t HS_CDECL hs_alloc_scratch(const hs_database_t *db, + hs_scratch_t **scratch) { + if (!db || !scratch) { + return HS_INVALID; + } + + /* We need to do some real sanity checks on the database as some users mmap + * in old deserialised databases, so this is the first real opportunity we + * have to make sure it is sane. + */ + hs_error_t rv = dbIsValid(db); + if (rv != HS_SUCCESS) { + return rv; + } + + /* We can also sanity-check the scratch parameter: if it points to an + * existing scratch area, that scratch should have valid magic bits. 
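 *
 * This is also what lets one scratch serve several databases: calling
 * hs_alloc_scratch() repeatedly with the same pointer only reallocates when
 * a later database needs more room, e.g. (illustrative; db_block and
 * db_stream stand for any two compiled databases):
 *
 *     hs_scratch_t *scratch = NULL;
 *     hs_alloc_scratch(db_block, &scratch);   // sized for db_block
 *     hs_alloc_scratch(db_stream, &scratch);  // grown only if needed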
*/ + if (*scratch != NULL) { + /* has to be aligned before we can do anything with it */ + if (!ISALIGNED_CL(*scratch)) { + return HS_INVALID; + } + if ((*scratch)->magic != SCRATCH_MAGIC) { + return HS_INVALID; + } + if (markScratchInUse(*scratch)) { + return HS_SCRATCH_IN_USE; + } + } + + const struct RoseEngine *rose = hs_get_bytecode(db); + int resize = 0; + + hs_scratch_t *proto; + hs_scratch_t *proto_tmp = hs_scratch_alloc(sizeof(struct hs_scratch) + 256); + hs_error_t proto_ret = hs_check_alloc(proto_tmp); + if (proto_ret != HS_SUCCESS) { + hs_scratch_free(proto_tmp); + if (*scratch) { + hs_scratch_free((*scratch)->scratch_alloc); + } + *scratch = NULL; + return proto_ret; + } + + proto = ROUNDUP_PTR(proto_tmp, 64); + + if (*scratch) { + *proto = **scratch; + } else { + memset(proto, 0, sizeof(*proto)); + resize = 1; + } + proto->scratch_alloc = (char *)proto_tmp; + + if (rose->anchoredDistance > proto->anchored_literal_region_len) { + resize = 1; + proto->anchored_literal_region_len = rose->anchoredDistance; + } + + if (rose->anchored_fatbit_size > proto->anchored_literal_fatbit_size) { + resize = 1; + proto->anchored_literal_fatbit_size = rose->anchored_fatbit_size; + } + + if (rose->delay_fatbit_size > proto->delay_fatbit_size) { + resize = 1; + proto->delay_fatbit_size = rose->delay_fatbit_size; + } + + if (rose->handledKeyFatbitSize > proto->handledKeyFatbitSize) { + resize = 1; + proto->handledKeyFatbitSize = rose->handledKeyFatbitSize; + } + + if (rose->tStateSize > proto->tStateSize) { + resize = 1; + proto->tStateSize = rose->tStateSize; + } + + u32 som_store_count = rose->somLocationCount; + if (som_store_count > proto->som_store_count) { + resize = 1; + proto->som_store_count = som_store_count; + } + + if (rose->somLocationFatbitSize > proto->som_fatbit_size) { + resize = 1; + proto->som_fatbit_size = rose->somLocationFatbitSize; + } + + u32 queueCount = rose->queueCount; + if (queueCount > proto->queueCount) { + resize = 1; + proto->queueCount = queueCount; + } + + if (rose->activeQueueArraySize > proto->activeQueueArraySize) { + resize = 1; + proto->activeQueueArraySize = rose->activeQueueArraySize; + } + + u32 bStateSize = 0; + if (rose->mode == HS_MODE_BLOCK) { + bStateSize = rose->stateOffsets.end; + } else if (rose->mode == HS_MODE_VECTORED) { + /* vectoring database require a full stream state (inc header) */ + bStateSize = sizeof(struct hs_stream) + rose->stateOffsets.end; + } + + if (bStateSize > proto->bStateSize) { + resize = 1; + proto->bStateSize = bStateSize; + } + + u32 fullStateSize = rose->scratchStateSize; + if (fullStateSize > proto->fullStateSize) { + resize = 1; + proto->fullStateSize = fullStateSize; + } + + if (rose->dkeyCount > proto->deduper.dkey_count) { + resize = 1; + proto->deduper.dkey_count = rose->dkeyCount; + proto->deduper.log_size = rose->dkeyLogSize; + } + + if (resize) { + if (*scratch) { + hs_scratch_free((*scratch)->scratch_alloc); + *scratch = NULL; + } + + hs_error_t alloc_ret = alloc_scratch(proto, scratch); + hs_scratch_free(proto_tmp); /* kill off temp used for sizing */ + if (alloc_ret != HS_SUCCESS) { + *scratch = NULL; + return alloc_ret; + } + } else { + hs_scratch_free(proto_tmp); /* kill off temp used for sizing */ + unmarkScratchInUse(*scratch); + } + + assert(!(*scratch)->in_use); + return HS_SUCCESS; +} + +HS_PUBLIC_API +hs_error_t HS_CDECL hs_clone_scratch(const hs_scratch_t *src, + hs_scratch_t **dest) { + if (!dest || !src || !ISALIGNED_CL(src) || src->magic != SCRATCH_MAGIC) { + return HS_INVALID; + } + + *dest = 
NULL; + hs_error_t ret = alloc_scratch(src, dest); + if (ret != HS_SUCCESS) { + *dest = NULL; + return ret; + } + + assert(!(*dest)->in_use); + return HS_SUCCESS; +} + +HS_PUBLIC_API +hs_error_t HS_CDECL hs_init_scratch(const hs_scratch_t *src, hs_scratch_t *dest) { + if (!src || !ISALIGNED_CL(src) || src->magic != SCRATCH_MAGIC) + return HS_INVALID; + if (!dest || !ISALIGNED_CL(dest)) + return HS_INVALID; + + memset(dest, 0, src->scratchSize); + return alloc_scratch(src, &dest); +} + +HS_PUBLIC_API +hs_error_t HS_CDECL hs_free_scratch(hs_scratch_t *scratch) { + if (scratch) { + /* has to be aligned before we can do anything with it */ + if (!ISALIGNED_CL(scratch)) { + return HS_INVALID; + } + if (scratch->magic != SCRATCH_MAGIC) { + return HS_INVALID; + } + if (markScratchInUse(scratch)) { + return HS_SCRATCH_IN_USE; + } + + scratch->magic = 0; + assert(scratch->scratch_alloc); + DEBUG_PRINTF("scratch %p is really at %p : freeing\n", scratch, + scratch->scratch_alloc); + hs_scratch_free(scratch->scratch_alloc); + } + + return HS_SUCCESS; +} + +HS_PUBLIC_API +hs_error_t HS_CDECL hs_scratch_size(const hs_scratch_t *scratch, size_t *size) { + if (!size || !scratch || !ISALIGNED_CL(scratch) || + scratch->magic != SCRATCH_MAGIC) { + return HS_INVALID; + } + + *size = scratch->scratchSize; + + return HS_SUCCESS; +} diff --git a/regex/scratch.h b/regex/scratch.h new file mode 100644 index 000000000..1256f7aba --- /dev/null +++ b/regex/scratch.h @@ -0,0 +1,276 @@ +/* + * Copyright (c) 2015-2019, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Scratch and associated data structures. + * + * This header gets pulled into many places (many deep, slow to compile + * places). Try to keep the included headers under control. 
+ */ + +#ifndef SCRATCH_H_DA6D4FC06FF410 +#define SCRATCH_H_DA6D4FC06FF410 + +#include "hs_common.h" +#include "ue2common.h" +#include "rose/rose_types.h" + +#ifdef __cplusplus +extern "C" +{ +#endif + +UNUSED static const u32 SCRATCH_MAGIC = 0x544F4259; + +struct fatbit; +struct hs_scratch; +struct RoseEngine; +struct mq; + +struct queue_match { + /** \brief used to store the current location of an (suf|out)fix match in + * the current buffer. + * + * As (suf|out)fixes always run in the main buffer and never in history + * this number will always be positive (matches at 0 belong to previous + * write). Hence we can get away with a size_t rather than the usual s64a + * for a location. */ + size_t loc; + + u32 queue; /**< queue index. */ +}; + +struct catchup_pq { + struct queue_match *qm; + u32 qm_size; /**< current size of the priority queue */ +}; + +/** \brief Status flag: user requested termination. */ +#define STATUS_TERMINATED (1U << 0) + +/** \brief Status flag: it has been determined that it is not possible for this + * stream to raise any more matches. + * + * This may be because all its exhaustion keys are on or for other reasons + * (anchored sections not matching). */ +#define STATUS_EXHAUSTED (1U << 1) + +/** \brief Status flag: Rose requires rebuild as delay literal matched in + * history. */ +#define STATUS_DELAY_DIRTY (1U << 2) + +/** \brief Status flag: Unexpected Rose program error. */ +#define STATUS_ERROR (1U << 3) + +/** \brief Core information about the current scan, used everywhere. */ +struct core_info { + void *userContext; /**< user-supplied context */ + + /** \brief user-supplied match callback */ + int (HS_CDECL *userCallback)(unsigned int id, unsigned long long from, + unsigned long long to, unsigned int flags, + void *ctx); + + const struct RoseEngine *rose; + char *state; /**< full stream state */ + char *exhaustionVector; /**< pointer to evec for this stream */ + char *logicalVector; /**< pointer to lvec for this stream */ + char *combVector; /**< pointer to cvec for this stream */ + const u8 *buf; /**< main scan buffer */ + size_t len; /**< length of main scan buffer in bytes */ + const u8 *hbuf; /**< history buffer */ + size_t hlen; /**< length of history buffer in bytes. */ + u64a buf_offset; /**< stream offset, for the base of the buffer */ + u8 status; /**< stream status bitmask, using STATUS_ flags above */ +}; + +/** \brief Rose state information. */ +struct RoseContext { + u8 mpv_inactive; + u64a groups; + u64a lit_offset_adjust; /**< offset to add to matches coming from hwlm */ + u64a delayLastEndOffset; /**< end of the last match from FDR used by delay + * code */ + u64a lastEndOffset; /**< end of the last match from FDR/anchored DFAs used + * by history code. 
anchored DFA matches update this + * when they are inserted into the literal match + * stream */ + u64a lastMatchOffset; /**< last match offset report up out of rose; + * used _only_ for debugging, asserts */ + u64a lastCombMatchOffset; /**< last match offset of active combinations */ + u64a minMatchOffset; /**< the earliest offset that we are still allowed to + * report */ + u64a minNonMpvMatchOffset; /**< the earliest offset that non-mpv engines are + * still allowed to report */ + u64a next_mpv_offset; /**< earliest offset that the MPV can next report a + * match, cleared if top events arrive */ + u32 filledDelayedSlots; + u32 curr_qi; /**< currently executing main queue index during + * \ref nfaQueueExec */ + + /** + * \brief Buffer for caseful long literal support, used in streaming mode + * only. + * + * If a long literal prefix was at the end of the buffer at the end of a + * stream write, then the long lit table hashes it and stores the result in + * stream state. At the start of the next write, this value is used to set + * this buffer to the matching prefix string (stored in the bytecode. + */ + const u8 *ll_buf; + + /** \brief Length in bytes of the string pointed to by ll_buf. */ + size_t ll_len; + + /** \brief Caseless version of ll_buf. */ + const u8 *ll_buf_nocase; + + /** \brief Length in bytes of the string pointed to by ll_buf_nocase. */ + size_t ll_len_nocase; +}; + +struct match_deduper { + struct fatbit *log[2]; /**< even, odd logs */ + struct fatbit *som_log[2]; /**< even, odd fatbit logs for som */ + u64a *som_start_log[2]; /**< even, odd start offset logs for som */ + u32 dkey_count; + u32 log_size; + u64a current_report_offset; + u8 som_log_dirty; +}; + +/** \brief Hyperscan scratch region header. + * + * NOTE: there is no requirement that scratch is 16-byte aligned, as it is + * allocated by a malloc equivalent, possibly supplied by the user. + */ +struct ALIGN_CL_DIRECTIVE hs_scratch { + u32 magic; + u8 in_use; /**< non-zero when being used by an API call. 
*/ + u32 queueCount; + u32 activeQueueArraySize; /**< size of active queue array fatbit in bytes */ + u32 bStateSize; /**< sizeof block mode states */ + u32 tStateSize; /**< sizeof transient rose states */ + u32 fullStateSize; /**< size of uncompressed nfa state */ + struct RoseContext tctxt; + char *bstate; /**< block mode states */ + char *tstate; /**< state for transient roses */ + char *fullState; /**< uncompressed NFA state */ + struct mq *queues; + struct fatbit *aqa; /**< active queue array; fatbit of queues that are valid + * & active */ + struct fatbit **delay_slots; + struct fatbit **al_log; + u64a al_log_sum; + struct catchup_pq catchup_pq; + struct core_info core_info; + struct match_deduper deduper; + u32 anchored_literal_region_len; + u32 anchored_literal_fatbit_size; /**< size of each anch fatbit in bytes */ + struct fatbit *handled_roles; /**< fatbit of ROLES (not states) already + * handled by this literal */ + u64a *som_store; /**< array of som locations */ + u64a *som_attempted_store; /**< array of som locations for fail stores */ + struct fatbit *som_set_now; /**< fatbit, true if the som location was set + * based on a match at the current offset */ + struct fatbit *som_attempted_set; /**< fatbit, true if the som location + * would have been set at the current offset if the + * location had been writable */ + u64a som_set_now_offset; /**< offset at which som_set_now represents */ + u32 som_store_count; + u32 som_fatbit_size; /**< size of som location fatbit structures in bytes */ + u32 handledKeyFatbitSize; /**< size of handled_keys fatbit in bytes */ + u32 delay_fatbit_size; /**< size of each delay fatbit in bytes */ + u32 scratchSize; + char *scratch_alloc; /* user allocated scratch object */ + u64a *fdr_conf; /**< FDR confirm value */ + u8 fdr_conf_offset; /**< offset where FDR/Teddy front end matches + * in buffer */ +}; + +/* array of fatbit ptr; TODO: why not an array of fatbits? */ +static really_inline +struct fatbit **getAnchoredLiteralLog(struct hs_scratch *scratch) { + return scratch->al_log; +} + +static really_inline +struct fatbit **getDelaySlots(struct hs_scratch *scratch) { + return scratch->delay_slots; +} + +static really_inline +char told_to_stop_matching(const struct hs_scratch *scratch) { + return scratch->core_info.status & STATUS_TERMINATED; +} + +static really_inline +char can_stop_matching(const struct hs_scratch *scratch) { + return scratch->core_info.status & + (STATUS_TERMINATED | STATUS_EXHAUSTED | STATUS_ERROR); +} + +static really_inline +char internal_matching_error(const struct hs_scratch *scratch) { + return scratch->core_info.status & STATUS_ERROR; +} + +/** + * \brief Mark scratch as in use. + * + * Returns non-zero if it was already in use, zero otherwise. + */ +static really_inline +char markScratchInUse(struct hs_scratch *scratch) { + DEBUG_PRINTF("marking scratch as in use\n"); + assert(scratch && scratch->magic == SCRATCH_MAGIC); + if (scratch->in_use) { + DEBUG_PRINTF("scratch already in use!\n"); + return 1; + } + scratch->in_use = 1; + return 0; +} + +/** + * \brief Mark scratch as no longer in use. 
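 *
 * Together with markScratchInUse() this forms the single-owner guard used by
 * every API entry point, e.g. (pattern only; do_scan is a placeholder):
 *
 *     if (unlikely(markScratchInUse(scratch)))
 *         return HS_SCRATCH_IN_USE;
 *     hs_error_t rv = do_scan(...);   // the actual scan work
 *     unmarkScratchInUse(scratch);
 *     return rv;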
+ */ +static really_inline +void unmarkScratchInUse(struct hs_scratch *scratch) { + DEBUG_PRINTF("marking scratch as not in use\n"); + assert(scratch && scratch->magic == SCRATCH_MAGIC); + assert(scratch->in_use == 1); + scratch->in_use = 0; +} + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif /* SCRATCH_H_DA6D4FC06FF410 */ + diff --git a/regex/smallwrite/smallwrite_internal.h b/regex/smallwrite/smallwrite_internal.h new file mode 100644 index 000000000..8f350dbea --- /dev/null +++ b/regex/smallwrite/smallwrite_internal.h @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2015, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef SMALLWRITE_INTERNAL_H +#define SMALLWRITE_INTERNAL_H + +#include "ue2common.h" + +// Runtime structure header for SmallWrite. +struct ALIGN_CL_DIRECTIVE SmallWriteEngine { + u32 largestBuffer; /**< largest buffer that can be considered small write */ + u32 start_offset; /**< where to start scanning in the buffer. */ + u32 size; /**< size of the small write engine in bytes (including the nfa) */ +}; + +struct NFA; + +static really_inline +const struct NFA *getSmwrNfa(const struct SmallWriteEngine *smwr) { + assert(smwr); + const struct NFA *n + = (const struct NFA *)((const char *)smwr + sizeof(*smwr)); + assert(ISALIGNED_CL(n)); + return n; +} + +#endif // SMALLWRITE_INTERNAL_H + diff --git a/regex/som/som_operation.h b/regex/som/som_operation.h new file mode 100644 index 000000000..d85ad2268 --- /dev/null +++ b/regex/som/som_operation.h @@ -0,0 +1,84 @@ +/* + * Copyright (c) 2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** + * \file + * \brief SOM runtime: data structures. + * + * Data structures used for SOM operations. + */ + +#ifndef SOM_OPERATION_H +#define SOM_OPERATION_H + +#include "ue2common.h" + +#define SOM_EXTERNAL_CALLBACK_REL 1 +#define SOM_INTERNAL_LOC_SET 2 +#define SOM_INTERNAL_LOC_SET_IF_UNSET 3 +#define SOM_INTERNAL_LOC_SET_IF_WRITABLE 4 +#define SOM_INTERNAL_LOC_SET_REV_NFA 5 +#define SOM_INTERNAL_LOC_SET_REV_NFA_IF_UNSET 6 +#define SOM_INTERNAL_LOC_SET_REV_NFA_IF_WRITABLE 7 +#define SOM_INTERNAL_LOC_COPY 8 +#define SOM_INTERNAL_LOC_COPY_IF_WRITABLE 9 +#define SOM_INTERNAL_LOC_MAKE_WRITABLE 10 +#define SOM_EXTERNAL_CALLBACK_STORED 11 +#define SOM_EXTERNAL_CALLBACK_ABS 12 +#define SOM_EXTERNAL_CALLBACK_REV_NFA 13 +#define SOM_INTERNAL_LOC_SET_FROM 14 +#define SOM_INTERNAL_LOC_SET_FROM_IF_WRITABLE 15 + +struct som_operation { + /** \brief Report type, from the definitions above. */ + u8 type; + + /* \brief SOM loc to modify. */ + u32 onmatch; + + union { + /** \brief SOM distance value, use varies according to type. + * + * - for SOM_EXTERNAL_CALLBACK_REL, from-offset is this many bytes + * before the to-offset. + * - for SOM_EXTERNAL_CALLBACK_ABS, set from-offset to this value. + * - for SOM_INTERNAL_LOC_COPY*, som location read_from. + */ + u64a somDistance; + + /** \brief Index of the reverse nfa. + * + * Used by SOM_EXTERNAL_CALLBACK_REV_NFA and + * SOM_INTERNAL_LOC_SET_REV_NFA*. + */ + u64a revNfaIndex; + } aux; +}; + +#endif // SOM_OPERATION_H + diff --git a/regex/som/som_runtime.c b/regex/som/som_runtime.c new file mode 100644 index 000000000..1a868efc9 --- /dev/null +++ b/regex/som/som_runtime.c @@ -0,0 +1,535 @@ +/* + * Copyright (c) 2015-2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief SOM runtime code. + * + * + * Runtime code for SOM handling called by the Rose callback adaptors. + * + * Note: + * Races between escapes making a som loc writeable and attempts to write to it + * at the same to_offset are always resolved as if the escape arrived first + * and then the request to write to that location. + */ + +#include "hs_internal.h" +#include "som_operation.h" +#include "som_runtime.h" +#include "scratch.h" +#include "ue2common.h" +#include "rose/rose_internal.h" +#include "nfa/nfa_api.h" +#include "nfa/nfa_internal.h" +#include "util/fatbit.h" +#include "util/multibit.h" + +static really_inline +void setSomLoc(struct fatbit *som_set_now, u64a *som_store, u32 som_store_count, + const struct som_operation *ri, u64a to_offset) { + /* validity handled by callers */ + assert(to_offset >= ri->aux.somDistance); + u64a start_offset = to_offset - ri->aux.somDistance; + u32 som_loc = ri->onmatch; + + /* resolve any races for matches at this point in favour of the earliest som + */ + if (!fatbit_set(som_set_now, som_store_count, som_loc)) { + som_store[som_loc] = start_offset; + } else { + LIMIT_TO_AT_MOST(&som_store[som_loc], start_offset); + } + + DEBUG_PRINTF("som_store[%u] set to %llu\n", som_loc, som_store[som_loc]); +} + +static really_inline +char ok_and_mark_if_write(u8 *som_store_valid, struct fatbit *som_set_now, + u8 *som_store_writable, u32 som_store_count, + u32 loc) { + return !mmbit_set(som_store_valid, som_store_count, loc) /* unwritten */ + || fatbit_isset(som_set_now, som_store_count, loc) /* write here, need + * to resolve race */ + || mmbit_isset(som_store_writable, som_store_count, loc); /* writable */ +} + +static really_inline +char ok_and_mark_if_unset(u8 *som_store_valid, struct fatbit *som_set_now, + u32 som_store_count, u32 loc) { + return !mmbit_set(som_store_valid, som_store_count, loc) /* unwritten */ + || fatbit_isset(som_set_now, som_store_count, loc); /* write here, need + * to resolve race */ +} + +static +int somRevCallback(UNUSED u64a start, u64a end, ReportID id, void *ctx) { + DEBUG_PRINTF("offset=%llu, id=%u\n", end, id); + + // We use the id to store the offset adjustment (for assertions like a + // leading \b or multiline mode). + assert(id <= 1); + u64a *from_offset = ctx; + LIMIT_TO_AT_MOST(from_offset, end + id); + return 1; // continue matching. 
+} + +static really_inline +const struct NFA *getSomRevNFA(const struct RoseEngine *t, u32 i) { + assert(t->somRevOffsetOffset); + const u32 *rev_offsets + = (const u32 *)((const u8 *)t + t->somRevOffsetOffset); + u32 nfa_offset = rev_offsets[i]; + assert(nfa_offset && nfa_offset < t->size); + const struct NFA *n = (const struct NFA *)(((const u8 *)t + nfa_offset)); + assert(ISALIGNED(n)); + + return n; +} + +static +void runRevNfa(struct hs_scratch *scratch, const struct som_operation *ri, + const u64a to_offset, u64a *from_offset) { + struct core_info *ci = &scratch->core_info; + + DEBUG_PRINTF("buf has %zu bytes total, history has %zu\n", + ci->len, ci->hlen); + + u32 nfa_idx = ri->aux.revNfaIndex; + DEBUG_PRINTF("run rev nfa %u from to_offset=%llu\n", nfa_idx, to_offset); + const struct NFA *nfa = getSomRevNFA(ci->rose, nfa_idx); + + assert(nfa->maxWidth); // No inf width rev NFAs. + + size_t buf_bytes = to_offset - ci->buf_offset; + size_t history_bytes = ci->hlen; + + DEBUG_PRINTF("nfa min/max widths [%u,%u], %zu in buffer, %zu in history\n", + nfa->minWidth, nfa->maxWidth, buf_bytes, history_bytes); + assert(nfa->minWidth <= buf_bytes + history_bytes); + + const u8 *buf = ci->buf; + const u8 *hbuf = ci->hbuf; + + // Work out if we need to scan any history as well. + if (history_bytes && buf_bytes < nfa->maxWidth) { + assert(hbuf); + size_t remainder = nfa->maxWidth - buf_bytes; + if (remainder < history_bytes) { + hbuf += history_bytes - remainder; + history_bytes = remainder; + } + } + + DEBUG_PRINTF("scanning %zu from buffer and %zu from history\n", buf_bytes, + history_bytes); + + *from_offset = to_offset; + + nfaBlockExecReverse(nfa, to_offset, buf, buf_bytes, hbuf, history_bytes, + somRevCallback, from_offset); + + assert(*from_offset <= to_offset); +} + +static really_inline +void setSomLocRevNfa(struct hs_scratch *scratch, struct fatbit *som_set_now, + u64a *som_store, u32 som_store_count, + const struct som_operation *ri, u64a to_offset) { + /* validity handled by callers */ + u64a from_offset = 0; + runRevNfa(scratch, ri, to_offset, &from_offset); + + u32 som_loc = ri->onmatch; + + /* resolve any races for matches at this point in favour of the earliest som + */ + if (!fatbit_set(som_set_now, som_store_count, som_loc)) { + som_store[som_loc] = from_offset; + } else { + LIMIT_TO_AT_MOST(&som_store[som_loc], from_offset); + } + + DEBUG_PRINTF("som_store[%u] set to %llu\n", som_loc, som_store[som_loc]); +} + +void handleSomInternal(struct hs_scratch *scratch, + const struct som_operation *ri, const u64a to_offset) { + assert(scratch); + assert(ri); + DEBUG_PRINTF("-->som action required at %llu\n", to_offset); + + // SOM handling at scan time operates on data held in scratch. In + // streaming mode, this data is read from / written out to stream state at + // stream write boundaries. 
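    // For orientation: the handling below ultimately feeds the `from`
    // argument of the user match callback. Userspace sketch (the compiler
    // side is not part of this kernel port; names are illustrative):
    //
    //     hs_database_t *db = NULL;
    //     hs_compile_error_t *compile_err = NULL;
    //     hs_compile("foo.*bar", HS_FLAG_SOM_LEFTMOST,
    //                HS_MODE_STREAM | HS_MODE_SOM_HORIZON_LARGE,
    //                NULL, &db, &compile_err);
    //     // With HS_FLAG_SOM_LEFTMOST the callback's `from` carries the
    //     // true start of match instead of 0.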
+ + struct core_info *ci = &scratch->core_info; + const struct RoseEngine *rose = ci->rose; + assert(rose->hasSom); + + const u32 som_store_count = rose->somLocationCount; + u8 *som_store_valid = (u8 *)ci->state + rose->stateOffsets.somValid; + u8 *som_store_writable = (u8 *)ci->state + rose->stateOffsets.somWritable; + struct fatbit *som_set_now = scratch->som_set_now; + struct fatbit *som_attempted_set = scratch->som_attempted_set; + u64a *som_store = scratch->som_store; + u64a *som_failed_store = scratch->som_attempted_store; + + if (to_offset != scratch->som_set_now_offset) { + assert(scratch->som_set_now_offset == ~0ULL + || to_offset > scratch->som_set_now_offset); + DEBUG_PRINTF("setting som_set_now_offset=%llu\n", to_offset); + fatbit_clear(som_set_now); + fatbit_clear(som_attempted_set); + scratch->som_set_now_offset = to_offset; + } + + switch (ri->type) { + case SOM_INTERNAL_LOC_SET: + DEBUG_PRINTF("SOM_INTERNAL_LOC_SET\n"); + mmbit_set(som_store_valid, som_store_count, ri->onmatch); + setSomLoc(som_set_now, som_store, som_store_count, ri, to_offset); + return; + case SOM_INTERNAL_LOC_SET_IF_UNSET: + DEBUG_PRINTF("SOM_INTERNAL_LOC_SET_IF_UNSET\n"); + if (ok_and_mark_if_unset(som_store_valid, som_set_now, som_store_count, + ri->onmatch)) { + setSomLoc(som_set_now, som_store, som_store_count, ri, to_offset); + } + return; + case SOM_INTERNAL_LOC_SET_IF_WRITABLE: { + u32 slot = ri->onmatch; + DEBUG_PRINTF("SOM_INTERNAL_LOC_SET_IF_WRITABLE\n"); + if (ok_and_mark_if_write(som_store_valid, som_set_now, + som_store_writable, som_store_count, slot)) { + setSomLoc(som_set_now, som_store, som_store_count, ri, to_offset); + mmbit_unset(som_store_writable, som_store_count, slot); + } else { + /* not writable, stash as an attempted write in case we are + * racing our escape. */ + DEBUG_PRINTF("not writable, stashing attempt\n"); + assert(to_offset >= ri->aux.somDistance); + u64a start_offset = to_offset - ri->aux.somDistance; + + if (!fatbit_set(som_attempted_set, som_store_count, slot)) { + som_failed_store[slot] = start_offset; + } else { + LIMIT_TO_AT_MOST(&som_failed_store[slot], start_offset); + } + DEBUG_PRINTF("som_failed_store[%u] = %llu\n", slot, + som_failed_store[slot]); + } + return; + } + case SOM_INTERNAL_LOC_SET_REV_NFA: + DEBUG_PRINTF("SOM_INTERNAL_LOC_SET_REV_NFA\n"); + mmbit_set(som_store_valid, som_store_count, ri->onmatch); + setSomLocRevNfa(scratch, som_set_now, som_store, som_store_count, ri, + to_offset); + return; + case SOM_INTERNAL_LOC_SET_REV_NFA_IF_UNSET: + DEBUG_PRINTF("SOM_INTERNAL_LOC_SET_REV_NFA_IF_UNSET\n"); + if (ok_and_mark_if_unset(som_store_valid, som_set_now, som_store_count, + ri->onmatch)) { + setSomLocRevNfa(scratch, som_set_now, som_store, som_store_count, + ri, to_offset); + } + return; + case SOM_INTERNAL_LOC_SET_REV_NFA_IF_WRITABLE: { + u32 slot = ri->onmatch; + DEBUG_PRINTF("SOM_INTERNAL_LOC_SET_IF_WRITABLE\n"); + if (ok_and_mark_if_write(som_store_valid, som_set_now, + som_store_writable, som_store_count, slot)) { + setSomLocRevNfa(scratch, som_set_now, som_store, som_store_count, + ri, to_offset); + mmbit_unset(som_store_writable, som_store_count, slot); + } else { + /* not writable, stash as an attempted write in case we are + * racing our escape. 
*/ + DEBUG_PRINTF("not writable, stashing attempt\n"); + + u64a from_offset = 0; + runRevNfa(scratch, ri, to_offset, &from_offset); + + if (!fatbit_set(som_attempted_set, som_store_count, slot)) { + som_failed_store[slot] = from_offset; + } else { + LIMIT_TO_AT_MOST(&som_failed_store[slot], from_offset); + } + DEBUG_PRINTF("som_failed_store[%u] = %llu\n", slot, + som_failed_store[slot]); + } + return; + } + case SOM_INTERNAL_LOC_COPY: { + u32 slot_in = ri->aux.somDistance; + u32 slot_out = ri->onmatch; + DEBUG_PRINTF("SOM_INTERNAL_LOC_COPY S[%u] = S[%u]\n", slot_out, + slot_in); + assert(mmbit_isset(som_store_valid, som_store_count, slot_in)); + mmbit_set(som_store_valid, som_store_count, slot_out); + fatbit_set(som_set_now, som_store_count, slot_out); + som_store[slot_out] = som_store[slot_in]; + + return; + } + case SOM_INTERNAL_LOC_COPY_IF_WRITABLE: { + u32 slot_in = ri->aux.somDistance; + u32 slot_out = ri->onmatch; + DEBUG_PRINTF("SOM_INTERNAL_LOC_COPY_IF_WRITABLE S[%u] = S[%u]\n", + slot_out, slot_in); + assert(mmbit_isset(som_store_valid, som_store_count, slot_in)); + if (ok_and_mark_if_write(som_store_valid, som_set_now, + som_store_writable, som_store_count, + slot_out)) { + DEBUG_PRINTF("copy, set som_store[%u]=%llu\n", slot_out, + som_store[slot_in]); + som_store[slot_out] = som_store[slot_in]; + fatbit_set(som_set_now, som_store_count, slot_out); + mmbit_unset(som_store_writable, som_store_count, slot_out); + } else { + /* not writable, stash as an attempted write in case we are + * racing our escape */ + DEBUG_PRINTF("not writable, stashing attempt\n"); + fatbit_set(som_attempted_set, som_store_count, slot_out); + som_failed_store[slot_out] = som_store[slot_in]; + DEBUG_PRINTF("som_failed_store[%u] = %llu\n", slot_out, + som_failed_store[slot_out]); + } + return; + } + case SOM_INTERNAL_LOC_MAKE_WRITABLE: { + u32 slot = ri->onmatch; + DEBUG_PRINTF("SOM_INTERNAL_LOC_MAKE_WRITABLE\n"); + /* if just written to the loc, ignore the racing escape */ + if (fatbit_isset(som_set_now, som_store_count, slot)) { + DEBUG_PRINTF("just written\n"); + return; + } + if (fatbit_isset(som_attempted_set, som_store_count, slot)) { + /* writes were waiting for an escape to arrive */ + DEBUG_PRINTF("setting som_store[%u] = %llu from " + "som_failed_store[%u]\n", slot, som_failed_store[slot], + slot); + som_store[slot] = som_failed_store[slot]; + fatbit_set(som_set_now, som_store_count, slot); + return; + } + mmbit_set(som_store_writable, som_store_count, slot); + return; + } + default: + DEBUG_PRINTF("unknown report type!\n"); + break; + } + + // All valid som_operation types should be handled and returned above. + assert(0); + return; +} + +// Returns the SOM offset. +u64a handleSomExternal(struct hs_scratch *scratch, + const struct som_operation *ri, + const u64a to_offset) { + assert(scratch); + assert(ri); + + // SOM handling at scan time operates on data held in scratch. In + // streaming mode, this data is read from / written out to stream state at + // stream write boundaries. 
+ + struct core_info *ci = &scratch->core_info; + const struct RoseEngine *rose = ci->rose; + assert(rose->hasSom); + + switch (ri->type) { + case SOM_EXTERNAL_CALLBACK_REL: + DEBUG_PRINTF("SOM_EXTERNAL_CALLBACK_REL: som is %llu chars back\n", + ri->aux.somDistance); + assert(to_offset >= ri->aux.somDistance); + return to_offset - ri->aux.somDistance; + case SOM_EXTERNAL_CALLBACK_ABS: + DEBUG_PRINTF("SOM_EXTERNAL_CALLBACK_ABS: som is at %llu\n", + ri->aux.somDistance); + assert(to_offset >= ri->aux.somDistance); + return ri->aux.somDistance; + case SOM_EXTERNAL_CALLBACK_STORED: { + const u64a *som_store = scratch->som_store; + u32 slot = ri->aux.somDistance; + DEBUG_PRINTF("SOM_EXTERNAL_CALLBACK_STORED: <- som_store[%u]=%llu\n", + slot, som_store[slot]); + + UNUSED const u32 som_store_count = rose->somLocationCount; + UNUSED const u8 *som_store_valid = (u8 *)ci->state + + rose->stateOffsets.somValid; + + assert(mmbit_isset(som_store_valid, som_store_count, slot)); + return som_store[slot]; + } + case SOM_EXTERNAL_CALLBACK_REV_NFA: { + DEBUG_PRINTF("SOM_EXTERNAL_CALLBACK_REV_NFA\n"); + u64a from_offset = 0; + runRevNfa(scratch, ri, to_offset, &from_offset); + return from_offset; + } + default: + DEBUG_PRINTF("unknown report type!\n"); + break; + } + + // All valid som_operation types should be handled and returned above. + assert(0); + return 0; +} + +void setSomFromSomAware(struct hs_scratch *scratch, + const struct som_operation *ri, u64a from_offset, + u64a to_offset) { + assert(scratch); + assert(ri); + assert(to_offset); + assert(ri->type == SOM_INTERNAL_LOC_SET_FROM + || ri->type == SOM_INTERNAL_LOC_SET_FROM_IF_WRITABLE); + + struct core_info *ci = &scratch->core_info; + const struct RoseEngine *rose = ci->rose; + assert(rose->hasSom); + + const u32 som_store_count = rose->somLocationCount; + u8 *som_store_valid = (u8 *)ci->state + rose->stateOffsets.somValid; + u8 *som_store_writable = (u8 *)ci->state + rose->stateOffsets.somWritable; + struct fatbit *som_set_now = scratch->som_set_now; + struct fatbit *som_attempted_set = scratch->som_attempted_set; + u64a *som_store = scratch->som_store; + u64a *som_failed_store = scratch->som_attempted_store; + + if (to_offset != scratch->som_set_now_offset) { + DEBUG_PRINTF("setting som_set_now_offset=%llu\n", to_offset); + fatbit_clear(som_set_now); + fatbit_clear(som_attempted_set); + scratch->som_set_now_offset = to_offset; + } + + if (ri->type == SOM_INTERNAL_LOC_SET_FROM) { + DEBUG_PRINTF("SOM_INTERNAL_LOC_SET_FROM\n"); + mmbit_set(som_store_valid, som_store_count, ri->onmatch); + setSomLoc(som_set_now, som_store, som_store_count, ri, from_offset); + } else { + DEBUG_PRINTF("SOM_INTERNAL_LOC_SET_FROM_IF_WRITABLE\n"); + if (ok_and_mark_if_write(som_store_valid, som_set_now, + som_store_writable, som_store_count, + ri->onmatch)) { + setSomLoc(som_set_now, som_store, som_store_count, ri, from_offset); + mmbit_unset(som_store_writable, som_store_count, ri->onmatch); + } else { + /* not writable, stash as an attempted write in case we are + * racing our escape. 
*/ + DEBUG_PRINTF("not writable, stashing attempt\n"); + assert(to_offset >= ri->aux.somDistance); + u32 som_loc = ri->onmatch; + + if (!fatbit_set(som_attempted_set, som_store_count, ri->onmatch)) { + som_failed_store[som_loc] = from_offset; + } else { + LIMIT_TO_AT_MOST(&som_failed_store[som_loc], from_offset); + } + DEBUG_PRINTF("som_failed_store[%u] = %llu\n", som_loc, + som_failed_store[som_loc]); + } + } +} + +static really_inline +int clearSomLog(struct hs_scratch *scratch, u64a offset, struct fatbit *log, + const u64a *starts) { + DEBUG_PRINTF("at %llu\n", offset); + struct core_info *ci = &scratch->core_info; + const struct RoseEngine *rose = ci->rose; + const u32 dkeyCount = rose->dkeyCount; + const u32 *dkey_to_report = (const u32 *) + ((const char *)rose + rose->invDkeyOffset); + u32 flags = 0; +#ifndef RELEASE_BUILD + if (scratch->deduper.current_report_offset != offset) { + flags |= HS_MATCH_FLAG_ADJUSTED; + } +#endif + + for (u32 it = fatbit_iterate(log, dkeyCount, MMB_INVALID); + it != MMB_INVALID; it = fatbit_iterate(log, dkeyCount, it)) { + u64a from_offset = starts[it]; + u32 onmatch = dkey_to_report[it]; + int halt = ci->userCallback(onmatch, from_offset, offset, flags, + ci->userContext); + if (halt) { + ci->status |= STATUS_TERMINATED; + return 1; + } + } + fatbit_clear(log); + return 0; +} + +int flushStoredSomMatches_i(struct hs_scratch *scratch, u64a offset) { + DEBUG_PRINTF("flush som matches\n"); + int halt = 0; + + assert(!told_to_stop_matching(scratch)); + + if (scratch->deduper.current_report_offset == ~0ULL) { + /* no matches recorded yet; just need to clear the logs */ + fatbit_clear(scratch->deduper.som_log[0]); + fatbit_clear(scratch->deduper.som_log[1]); + scratch->deduper.som_log_dirty = 0; + return 0; + } + + /* fire any reports from the logs and clear them */ + if (offset == scratch->deduper.current_report_offset + 1) { + struct fatbit *done_log = scratch->deduper.som_log[offset % 2]; + u64a *done_starts = scratch->deduper.som_start_log[offset % 2]; + + halt = clearSomLog(scratch, scratch->deduper.current_report_offset - 1, + done_log, done_starts); + scratch->deduper.som_log_dirty >>= 1; + } else { + /* need to report both logs */ + u64a f_offset = scratch->deduper.current_report_offset - 1; + u64a s_offset = scratch->deduper.current_report_offset; + struct fatbit *first_log = scratch->deduper.som_log[f_offset % 2]; + u64a *first_starts = scratch->deduper.som_start_log[f_offset % 2]; + struct fatbit *second_log = scratch->deduper.som_log[s_offset % 2]; + u64a *second_starts = scratch->deduper.som_start_log[s_offset % 2]; + + halt = clearSomLog(scratch, f_offset, first_log, first_starts) || + clearSomLog(scratch, s_offset, second_log, second_starts); + scratch->deduper.som_log_dirty = 0; + } + + return halt; +} diff --git a/regex/som/som_runtime.h b/regex/som/som_runtime.h new file mode 100644 index 000000000..30c7ace8c --- /dev/null +++ b/regex/som/som_runtime.h @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2015, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** + * \file + * \brief SOM runtime code. + * + * Runtime code for SOM handling called by the Rose callback adaptors. + */ + +#ifndef SOM_RUNTIME_H +#define SOM_RUNTIME_H + +#include "scratch.h" +#include "ue2common.h" + +struct som_operation; + +void handleSomInternal(struct hs_scratch *scratch, + const struct som_operation *ri, const u64a to_offset); + +// Returns the from_offset. +u64a handleSomExternal(struct hs_scratch *scratch, + const struct som_operation *ri, const u64a to_offset); + +void setSomFromSomAware(struct hs_scratch *scratch, + const struct som_operation *ri, u64a from_offset, + u64a to_offset); + +int flushStoredSomMatches_i(struct hs_scratch *scratch, u64a offset); + +static really_inline +int flushStoredSomMatches(struct hs_scratch *scratch, u64a offset) { + if (scratch->deduper.som_log_dirty) { + return flushStoredSomMatches_i(scratch, offset); + } else { + return 0; + } +} + +#endif // SOM_RUNTIME_H + diff --git a/regex/som/som_stream.c b/regex/som/som_stream.c new file mode 100644 index 000000000..93ab709ed --- /dev/null +++ b/regex/som/som_stream.c @@ -0,0 +1,174 @@ +/* + * Copyright (c) 2015, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief SOM streaming runtime code. + * + * Code in this file handles storing and loading SOM slot information from + * stream state. + */ + +#include "scratch.h" +#include "som_stream.h" +#include "rose/rose_internal.h" +#include "util/multibit.h" + +// Sentinel values stored in stream state and used to represent an SOM distance +// that is too far in the past to be stored in the available space in stream +// state. + +#define SOM_SENTINEL_LARGE (~0ull) +#define SOM_SENTINEL_MEDIUM (~0u) +#define SOM_SENTINEL_SMALL ((u16)~0u) + +static really_inline +void storeSomValue(void *stream_som_store, u64a som_value, + u64a stream_offset, u8 som_size) { + // Special case for sentinel value. + if (som_value == SOM_SENTINEL_LARGE) { + switch (som_size) { + case 2: + *(u16 *)stream_som_store = SOM_SENTINEL_SMALL; + break; + case 4: + *(u32 *)stream_som_store = SOM_SENTINEL_MEDIUM; + break; + case 8: + *(u64a *)stream_som_store = SOM_SENTINEL_LARGE; + break; + default: + break; + } + return; + } + + assert(som_value <= stream_offset); + u64a rel_offset = stream_offset - som_value; + DEBUG_PRINTF("rel_offset=%llu\n", rel_offset); + + switch (som_size) { + case 2: + rel_offset = MIN(rel_offset, SOM_SENTINEL_SMALL); + assert(ISALIGNED_N(stream_som_store, alignof(u16))); + *(u16 *)stream_som_store = rel_offset; + break; + case 4: + rel_offset = MIN(rel_offset, SOM_SENTINEL_MEDIUM); + assert(ISALIGNED_N(stream_som_store, alignof(u32))); + *(u32 *)stream_som_store = rel_offset; + break; + case 8: + assert(ISALIGNED_N(stream_som_store, alignof(u64a))); + *(u64a *)stream_som_store = rel_offset; + break; + default: + assert(0); + break; + } +} + +void storeSomToStream(struct hs_scratch *scratch, const u64a offset) { + assert(scratch); + DEBUG_PRINTF("stream offset %llu\n", offset); + + struct core_info *ci = &scratch->core_info; + const struct RoseEngine *rose = ci->rose; + + const u32 som_store_count = rose->somLocationCount; + assert(som_store_count); // Caller should ensure that we have work to do. 
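+    /* The loop below walks the som_store_valid multibit and, for each live
+     * slot i, packs scratch->som_store[i] into stream state as an offset
+     * relative to the current stream offset, using somHorizon (2, 4 or 8)
+     * bytes per slot; distances too large for that width saturate to the
+     * sentinel for the width. */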
+ + u8 *som_store_valid = (u8 *)ci->state + rose->stateOffsets.somValid; + char *stream_som_store = ci->state + rose->stateOffsets.somLocation; + const u64a *som_store = scratch->som_store; + const u8 som_size = rose->somHorizon; + + for (u32 i = mmbit_iterate(som_store_valid, som_store_count, MMB_INVALID); + i != MMB_INVALID; + i = mmbit_iterate(som_store_valid, som_store_count, i)) { + DEBUG_PRINTF("storing %llu in %u\n", som_store[i], i); + storeSomValue(stream_som_store + (i * som_size), som_store[i], + offset, som_size); + } +} + +static really_inline +u64a loadSomValue(const void *stream_som_store, u64a stream_offset, + u8 som_size) { + u64a rel_offset; + switch (som_size) { + case 2: + assert(ISALIGNED_N(stream_som_store, alignof(u16))); + rel_offset = *(const u16 *)stream_som_store; + if (rel_offset == SOM_SENTINEL_SMALL) { + return SOM_SENTINEL_LARGE; + } + break; + case 4: + assert(ISALIGNED_N(stream_som_store, alignof(u32))); + rel_offset = *(const u32 *)stream_som_store; + if (rel_offset == SOM_SENTINEL_MEDIUM) { + return SOM_SENTINEL_LARGE; + } + break; + case 8: + assert(ISALIGNED_N(stream_som_store, alignof(u64a))); + rel_offset = *(const u64a *)stream_som_store; + break; + default: + assert(0); + rel_offset = 0; + break; + } + + DEBUG_PRINTF("rel_offset=%llu\n", rel_offset); + return stream_offset - rel_offset; +} + +void loadSomFromStream(struct hs_scratch *scratch, const u64a offset) { + assert(scratch); + DEBUG_PRINTF("stream offset %llu\n", offset); + + struct core_info *ci = &scratch->core_info; + const struct RoseEngine *rose = ci->rose; + + const u32 som_store_count = rose->somLocationCount; + assert(som_store_count); // Caller should ensure that we have work to do. + + const u8 *som_store_valid = (u8 *)ci->state + rose->stateOffsets.somValid; + const char *stream_som_store = ci->state + rose->stateOffsets.somLocation; + u64a *som_store = scratch->som_store; + const u8 som_size = rose->somHorizon; + + for (u32 i = mmbit_iterate(som_store_valid, som_store_count, MMB_INVALID); + i != MMB_INVALID; + i = mmbit_iterate(som_store_valid, som_store_count, i)) { + som_store[i] = loadSomValue(stream_som_store + (i*som_size), offset, + som_size); + DEBUG_PRINTF("loaded %llu from %u\n", som_store[i], i); + } +} diff --git a/regex/som/som_stream.h b/regex/som/som_stream.h new file mode 100644 index 000000000..8b62264d1 --- /dev/null +++ b/regex/som/som_stream.h @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2015, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief SOM streaming runtime code. + */ + +#ifndef SOM_STREAM_H +#define SOM_STREAM_H + +#include "ue2common.h" + +struct hs_scratch; + +/** \brief Write all SOM slot information from scratch out to stream state + * (given the current stream offset). */ +void storeSomToStream(struct hs_scratch *scratch, const u64a offset); + +/** \brief Read all SOM slot information from stream state into scratch (given + * the current stream offset). */ +void loadSomFromStream(struct hs_scratch *scratch, const u64a offset); + +#endif diff --git a/regex/state.h b/regex/state.h new file mode 100644 index 000000000..9ade59db4 --- /dev/null +++ b/regex/state.h @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2015, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Stream state data structures. + */ + +#ifndef STATE_H +#define STATE_H + +#include "hs_runtime.h" /* match_event_handler */ +#include "ue2common.h" + +#ifdef __cplusplus +extern "C" +{ +#endif + +struct RoseEngine; + +/** \brief Stream context: allocated for each stream. + * + * struct hs_stream is followed in memory by the main Rose state: history, + * exhaustion, individual states, etc. The RoseEngine has the offsets required + * to correctly index into the main state structure. The offsets used by the + * RoseEngine are based on the end of the hs_stream struct as its size may + * vary from platform to platform. + */ +struct hs_stream { + /** \brief The RoseEngine that this stream is matching against. 
*/ + const struct RoseEngine *rose; + + /** \brief The current stream offset. */ + u64a offset; +}; + +#define getMultiState(hs_s) ((char *)(hs_s) + sizeof(*(hs_s))) +#define getMultiStateConst(hs_s) ((const char *)(hs_s) + sizeof(*(hs_s))) + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/regex/stream_compress.c b/regex/stream_compress.c new file mode 100644 index 000000000..1f7b01e82 --- /dev/null +++ b/regex/stream_compress.c @@ -0,0 +1,134 @@ +/* + * Copyright (c) 2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "stream_compress.h" + +#include "state.h" +#include "nfa/nfa_internal.h" +#include "rose/rose_internal.h" +#include "util/multibit.h" +#include "util/multibit_compress.h" +#include "util/uniform_ops.h" + +#ifndef __KERNEL__ +#include +#else +#include +#endif + +#define COPY_IN(p, sz) do { \ + assert(currOffset + sz <= buf_size); \ + memcpy(buf + currOffset, p, sz); \ + currOffset += sz; \ + DEBUG_PRINTF("co = %zu\n", currOffset); \ + } while (0); + +#define COPY_OUT(p, sz) do { \ + if (currOffset + sz > buf_size) { \ + return 0; \ + } \ + memcpy(p, buf + currOffset, sz); \ + currOffset += sz; \ + DEBUG_PRINTF("co = %zu\n", currOffset); \ + } while (0); + +#define SIZE_COPY_IN(p, sz) do { \ + currOffset += sz; \ + DEBUG_PRINTF("co = %zu\n", currOffset); \ + } while (0); + +#define COPY_MULTIBIT_IN(p, total_bits) do { \ + size_t sz; \ + STREAM_QUAL u8 *bits = (STREAM_QUAL u8 *)p; \ + BUF_QUAL u8 *comp = (BUF_QUAL u8 *)(buf + currOffset); \ + if (!mmbit_compress(bits, total_bits, comp, &sz, \ + buf_size - currOffset)) { \ + return 0; /* error */ \ + } \ + currOffset += sz; \ + DEBUG_PRINTF("co = %zu\n", currOffset); \ + } while (0); + +#define COPY_MULTIBIT_OUT(p, total_bits) do { \ + size_t sz; \ + STREAM_QUAL u8 *bits = (STREAM_QUAL u8 *)p; \ + BUF_QUAL u8 *comp = (BUF_QUAL u8 *)(buf + currOffset); \ + if (!mmbit_decompress(bits, total_bits, comp, &sz, \ + buf_size - currOffset)) { \ + return 0; /* error */ \ + } \ + currOffset += sz; \ + DEBUG_PRINTF("co = %zu\n", currOffset); \ + } while (0); + +#define COPY_MULTIBIT_SIZE(p, total_bits) do { \ + STREAM_QUAL u8 *bits = (STREAM_QUAL u8 *)p; \ + size_t sz = mmbit_compsize(bits, total_bits); \ + currOffset += sz; \ + DEBUG_PRINTF("co = %zu\n", currOffset); \ + } while (0); + +#define COPY COPY_OUT +#define COPY_MULTIBIT COPY_MULTIBIT_OUT +#define ASSIGN(lhs, rhs) do { lhs = rhs; } while (0) +#define FN_SUFFIX expand +#define STREAM_QUAL +#define BUF_QUAL const +#include "stream_compress_impl.h" + +int expand_stream(struct hs_stream *stream, const struct RoseEngine *rose, + const char *buf, size_t buf_size) { + return sc_expand(rose, stream, buf, buf_size); +} + +#define COPY COPY_IN +#define COPY_MULTIBIT COPY_MULTIBIT_IN +#define ASSIGN(lhs, rhs) do { } while (0) +#define FN_SUFFIX compress +#define STREAM_QUAL const +#define BUF_QUAL +#include "stream_compress_impl.h" + +size_t compress_stream(char *buf, size_t buf_size, + const struct RoseEngine *rose, + const struct hs_stream *stream) { + return sc_compress(rose, stream, buf, buf_size); +} + +#define COPY SIZE_COPY_IN +#define COPY_MULTIBIT COPY_MULTIBIT_SIZE +#define ASSIGN(lhs, rhs) do { } while (0) +#define FN_SUFFIX size +#define STREAM_QUAL const +#define BUF_QUAL UNUSED +#include "stream_compress_impl.h" + +size_t size_compress_stream(const struct RoseEngine *rose, + const struct hs_stream *stream) { + return sc_size(rose, stream, NULL, 0); +} diff --git a/regex/stream_compress.h b/regex/stream_compress.h new file mode 100644 index 000000000..fb2e5cade --- /dev/null +++ b/regex/stream_compress.h @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. 
+ * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Functions for dynamic compress/expand for streams. + */ + +#ifndef STREAM_COMPRESS_H +#define STREAM_COMPRESS_H + +#ifndef __KERNEL__ +#include +#else +#include +#endif + +struct hs_stream; +struct RoseEngine; + +int expand_stream(struct hs_stream *out, const struct RoseEngine *rose, + const char *buf, size_t buf_size); + +size_t compress_stream(char *buf, size_t buf_size, + const struct RoseEngine *rose, + const struct hs_stream *src); + +size_t size_compress_stream(const struct RoseEngine *rose, + const struct hs_stream *stream); + +#endif diff --git a/regex/stream_compress_impl.h b/regex/stream_compress_impl.h new file mode 100644 index 000000000..d1ccf5e6d --- /dev/null +++ b/regex/stream_compress_impl.h @@ -0,0 +1,193 @@ +/* + * Copyright (c) 2017-2018, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "util/join.h" + +#define COPY_FIELD(x) COPY(&x, sizeof(x)) +#define COPY_LEFTFIXES JOIN(sc_left_, FN_SUFFIX) +#define COPY_SOM_INFO JOIN(sc_som_, FN_SUFFIX) + +static +size_t COPY_LEFTFIXES(const struct RoseEngine *rose, size_t currOffset, + STREAM_QUAL struct hs_stream *stream, + BUF_QUAL char *buf, UNUSED size_t buf_size) { + if (!rose->activeLeftIterOffset) { + return currOffset; + } + + const struct RoseStateOffsets *so = &rose->stateOffsets; + STREAM_QUAL char *stream_body + = ((STREAM_QUAL char *)stream) + sizeof(struct hs_stream); + + /* Note: in the expand case the active left array has already been copied + * into the stream. */ + const u8 *ara = (const u8 *)(stream_body + so->activeLeftArray); + const u32 arCount = rose->activeLeftCount; + const struct LeftNfaInfo *left_table = getLeftTable(rose); + + /* We only want to look at non-transient leftfixes */ + const struct mmbit_sparse_iter *it = getActiveLeftIter(rose); + struct mmbit_sparse_state si_state[MAX_SPARSE_ITER_STATES]; + u32 dummy; + u32 ri = mmbit_sparse_iter_begin(ara, arCount, &dummy, it, si_state); + for (; ri != MMB_INVALID; + ri = mmbit_sparse_iter_next(ara, arCount, ri, &dummy, it, si_state)) { + u32 qi = ri + rose->leftfixBeginQueue; + UNUSED const struct LeftNfaInfo *left = left_table + ri; + const struct NfaInfo *nfa_info = getNfaInfoByQueue(rose, qi); + const struct NFA *nfa = getNfaByInfo(rose, nfa_info); + + COPY(stream_body + nfa_info->stateOffset, nfa->streamStateSize); + /* copy the one whole byte for active leftfixes as well */ + assert(left->lagIndex != ROSE_OFFSET_INVALID); + COPY(stream_body + so->leftfixLagTable + left->lagIndex, 1); + } + + return currOffset; +} + +static +size_t COPY_SOM_INFO(const struct RoseEngine *rose, size_t currOffset, + STREAM_QUAL struct hs_stream *stream, + BUF_QUAL char *buf, UNUSED size_t buf_size) { + const struct RoseStateOffsets *so = &rose->stateOffsets; + + if (!so->somLocation) { + assert(!so->somValid); + assert(!so->somWritable); + return currOffset; + } + + STREAM_QUAL char *stream_body + = ((STREAM_QUAL char *)stream) + sizeof(struct hs_stream); + + assert(so->somValid); + assert(so->somWritable); + + COPY_MULTIBIT(stream_body + so->somWritable, rose->somLocationCount); + COPY_MULTIBIT(stream_body + so->somValid, rose->somLocationCount); + + /* Copy only the som slots which contain valid values. */ + /* Note: in the expand case the som valid array has been copied in. 
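+     * The mmbit iteration below therefore reads meaningful valid bits on
+     * both the compress and the expand paths.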
*/ + const u8 *svalid = (const u8 *)(stream_body + so->somValid); + u32 s_count = rose->somLocationCount; + u32 s_width = rose->somHorizon; + for (u32 slot = mmbit_iterate(svalid, s_count, MMB_INVALID); + slot != MMB_INVALID; slot = mmbit_iterate(svalid, s_count, slot)) { + COPY(stream_body + so->somLocation + slot * s_width, s_width); + } + + return currOffset; +} + +static +size_t JOIN(sc_, FN_SUFFIX)(const struct RoseEngine *rose, + STREAM_QUAL struct hs_stream *stream, + BUF_QUAL char *buf, UNUSED size_t buf_size) { + size_t currOffset = 0; + const struct RoseStateOffsets *so = &rose->stateOffsets; + + STREAM_QUAL char *stream_body + = ((STREAM_QUAL char *)stream) + sizeof(struct hs_stream); + + COPY_FIELD(stream->offset); + ASSIGN(stream->rose, rose); + + COPY(stream_body + ROSE_STATE_OFFSET_STATUS_FLAGS, 1); + COPY_MULTIBIT(stream_body + ROSE_STATE_OFFSET_ROLE_MMBIT, rose->rolesWithStateCount); + + /* stream is valid in compress/size, and stream->offset has been set already + * on the expand side */ + u64a offset = stream->offset; + u32 history = MIN((u32)offset, rose->historyRequired); + + /* copy the active mmbits */ + COPY_MULTIBIT(stream_body + so->activeLeafArray, rose->activeArrayCount); + COPY_MULTIBIT(stream_body + so->activeLeftArray, rose->activeLeftCount); + + COPY(stream_body + so->longLitState, so->longLitState_size); + + /* Leftlag table will be handled later, for active leftfixes */ + + /* anchored table state is not required once we are deep in the stream */ + if (offset <= rose->anchoredDistance) { + COPY(stream_body + so->anchorState, rose->anchorStateSize); + } + + COPY(stream_body + so->groups, so->groups_size); + + /* copy the real bits of history */ + UNUSED u32 hend = so->history + rose->historyRequired; + COPY(stream_body + hend - history, history); + + /* copy the exhaustion multibit */ + COPY_MULTIBIT(stream_body + so->exhausted, rose->ekeyCount); + + /* copy the logical multibit */ + COPY_MULTIBIT(stream_body + so->logicalVec, + rose->lkeyCount + rose->lopCount); + + /* copy the combination multibit */ + COPY_MULTIBIT(stream_body + so->combVec, rose->ckeyCount); + + /* copy nfa stream state for endfixes */ + /* Note: in the expand case the active array has already been copied into + * the stream. 
*/ + const u8 *aa = (const u8 *)(stream_body + so->activeLeafArray); + u32 aaCount = rose->activeArrayCount; + for (u32 qi = mmbit_iterate(aa, aaCount, MMB_INVALID); qi != MMB_INVALID; + qi = mmbit_iterate(aa, aaCount, qi)) { + DEBUG_PRINTF("saving stream state for qi=%u\n", qi); + const struct NfaInfo *nfa_info = getNfaInfoByQueue(rose, qi); + const struct NFA *nfa = getNfaByInfo(rose, nfa_info); + COPY(stream_body + nfa_info->stateOffset, nfa->streamStateSize); + } + + /* copy nfa stream state for leftfixes */ + currOffset = COPY_LEFTFIXES(rose, currOffset, stream, buf, buf_size); + if (!currOffset) { + return 0; + } + + currOffset = COPY_SOM_INFO(rose, currOffset, stream, buf, buf_size); + if (!currOffset) { + return 0; + } + + return currOffset; +} + +#undef ASSIGN +#undef COPY +#undef COPY_FIELD +#undef COPT_LEFTFIXES +#undef COPY_MULTIBIT +#undef COPY_SOM_INFO +#undef FN_SUFFIX +#undef BUF_QUAL +#undef STREAM_QUAL diff --git a/regex/ue2common.h b/regex/ue2common.h new file mode 100644 index 000000000..7b471c8ee --- /dev/null +++ b/regex/ue2common.h @@ -0,0 +1,247 @@ +/* + * Copyright (c) 2015-2018, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Core UE2 global types, defines, utilities. + * + * NOTE WELL: this file is included into both C and C++ source code, so + * be sure to remain compatible with both. + */ + +#ifndef UE2COMMON_H +#define UE2COMMON_H + +#include "config.h" + +#ifndef __KERNEL__ + +/* standard types used across ue2 */ + +// We use the size_t type all over the place, usually defined in stddef.h. 
+#include +// stdint.h for things like uintptr_t and friends +#include + +#if defined(__cplusplus) +# define FALLTHROUGH [[fallthrough]] +#elif !defined(_WIN32) && __has_attribute(__fallthrough__) +# define FALLTHROUGH __attribute__((__fallthrough__)) +#else +# define FALLTHROUGH do {} while (0) /* fallthrough */ +#endif + +/* ick */ +#if defined(_WIN32) +#define ALIGN_ATTR(x) __declspec(align(x)) +#else +#define ALIGN_ATTR(x) __attribute__((aligned((x)))) +#endif + +#define ALIGN_DIRECTIVE ALIGN_ATTR(16) +#define ALIGN_AVX_DIRECTIVE ALIGN_ATTR(32) +#define ALIGN_CL_DIRECTIVE ALIGN_ATTR(64) + +typedef signed char s8; +typedef unsigned char u8; +typedef signed short s16; +typedef unsigned short u16; +typedef unsigned int u32; +typedef signed int s32; + +/* We append the 'a' for aligned, since these aren't common, garden variety + * 64 bit values. The alignment is necessary for structs on some platforms, + * so we don't end up performing accidental unaligned accesses. */ +#if defined(_WIN32) && ! defined(_WIN64) +typedef unsigned long long ALIGN_ATTR(4) u64a; +typedef signed long long ALIGN_ATTR(4) s64a; +#else +typedef unsigned long long ALIGN_ATTR(8) u64a; +typedef signed long long ALIGN_ATTR(8) s64a; +#endif + +/* get the SIMD types */ +#include "util/simd_types.h" + +/** \brief Report identifier, used for internal IDs and external IDs (those + * reported on match). */ +typedef u32 ReportID; + +/* Shorthand for attribute to mark a function as part of our public API. + * Functions without this attribute will be hidden. */ +#if !defined(_WIN32) +#define HS_PUBLIC_API __attribute__((visibility("default"))) +#else +// TODO: dllexport defines for windows +#define HS_PUBLIC_API +#endif + +#define ARRAY_LENGTH(a) (sizeof(a)/sizeof((a)[0])) + +/** \brief Shorthand for the attribute to shut gcc about unused parameters */ +#if !defined(_WIN32) +#define UNUSED __attribute__ ((unused)) +#else +#define UNUSED +#endif + +/* really_inline forces inlining always */ +#if !defined(_WIN32) +#if defined(HS_OPTIMIZE) +#define really_inline inline __attribute__ ((always_inline, unused)) +#else +#define really_inline __attribute__ ((unused)) +#endif + +/** no, seriously, inline it, even if building in debug mode */ +#define really_really_inline inline __attribute__ ((always_inline, unused)) +#define never_inline __attribute__ ((noinline)) +#define alignof __alignof +#define HAVE_TYPEOF 1 + +#else // ms windows +#define really_inline __forceinline +#define really_really_inline __forceinline +#define never_inline +#define __builtin_prefetch(...) do {} while(0) +#if defined(__cplusplus) +#define __typeof__ decltype +#define HAVE_TYPEOF 1 +#else // C +/* msvc doesn't have decltype or typeof in C */ +#define inline __inline +#define alignof __alignof +#endif +#endif + + +// We use C99-style "restrict". 
+#ifdef _WIN32 +#ifdef __cplusplus +#define restrict +#else +#define restrict __restrict +#endif +#else +#define restrict __restrict +#endif + + +// Align to 16-byte boundary +#define ROUNDUP_16(a) (((a) + 0xf) & ~0xf) +#define ROUNDDOWN_16(a) ((a) & ~0xf) + +// Align to N-byte boundary +#define ROUNDUP_N(a, n) (((a) + ((n)-1)) & ~((n)-1)) +#define ROUNDDOWN_N(a, n) ((a) & ~((n)-1)) + +// Align to a cacheline - assumed to be 64 bytes +#define ROUNDUP_CL(a) ROUNDUP_N(a, 64) + +// Align ptr to next N-byte boundary +#if defined(HAVE_TYPEOF) +#define ROUNDUP_PTR(ptr, n) (__typeof__(ptr))(ROUNDUP_N((uintptr_t)(ptr), (n))) +#define ROUNDDOWN_PTR(ptr, n) (__typeof__(ptr))(ROUNDDOWN_N((uintptr_t)(ptr), (n))) +#else +#define ROUNDUP_PTR(ptr, n) (void*)(ROUNDUP_N((uintptr_t)(ptr), (n))) +#define ROUNDDOWN_PTR(ptr, n) (void*)(ROUNDDOWN_N((uintptr_t)(ptr), (n))) +#endif + +#define ISALIGNED_N(ptr, n) (((uintptr_t)(ptr) & ((n) - 1)) == 0) +#define ISALIGNED_16(ptr) ISALIGNED_N((ptr), 16) +#define ISALIGNED_CL(ptr) ISALIGNED_N((ptr), 64) +#if defined(HAVE_TYPEOF) +#define ISALIGNED(ptr) ISALIGNED_N((ptr), alignof(__typeof__(*(ptr)))) +#else +/* we should probably avoid using this test in C */ +#define ISALIGNED(ptr) (1) +#endif +#define N_CHARS 256 + +// Maximum offset representable in the 'unsigned long long' we use to return +// offset values. +#define MAX_OFFSET 0xffffffffffffffffULL + +#if !defined(MIN) + #define MIN(a,b) ((a) < (b) ? (a) : (b)) +#endif +#if !defined(MAX) + #define MAX(a,b) ((a) > (b) ? (a) : (b)) +#endif + +#define LIMIT_TO_AT_MOST(a, b) (*(a) = MIN(*(a),(b))) +#define ENSURE_AT_LEAST(a, b) (*(a) = MAX(*(a),(b))) + +#ifndef _WIN32 +#ifndef likely + #define likely(x) __builtin_expect(!!(x), 1) +#endif +#ifndef unlikely + #define unlikely(x) __builtin_expect(!!(x), 0) +#endif +#else +#define likely(x) (x) +#define unlikely(x) (x) +#endif + +#if !defined(RELEASE_BUILD) || defined(DEBUG) +#ifdef _WIN32 +#define PATH_SEP '\\' +#else +#define PATH_SEP '/' +#endif +#endif + +#if defined(DEBUG) && !defined(DEBUG_PRINTF) +#include +#include +#define DEBUG_PRINTF(format, ...) printf("%s:%s:%d:" format, \ + strrchr(__FILE__, PATH_SEP) + 1, \ + __func__, __LINE__, ## __VA_ARGS__) +#elif !defined(DEBUG_PRINTF) +#define DEBUG_PRINTF(format, ...) pr_notice("%s:%s:%d:" format, \ + strrchr(__FILE__, PATH_SEP) + 1, \ + __func__, __LINE__, ## __VA_ARGS__) +#endif + +#if !defined(RELEASE_BUILD) +#include +#include +#define ADEBUG_PRINTF(format, ...) printf("!%s:%s:%d:" format, \ + strrchr(__FILE__, PATH_SEP) + 1, \ + __func__, __LINE__, ## __VA_ARGS__) +#else +#define ADEBUG_PRINTF(format, ...) do { } while(0) +#endif + +#include + +#else +#include "ue2common_kern.h" +#endif + +#endif diff --git a/regex/util/arch.h b/regex/util/arch.h new file mode 100644 index 000000000..782ad5b2e --- /dev/null +++ b/regex/util/arch.h @@ -0,0 +1,92 @@ +/* + * Copyright (c) 2017-2020, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Per-platform architecture definitions + */ + +#ifndef UTIL_ARCH_H_ +#define UTIL_ARCH_H_ + +#if !defined(__KERNEL__) +#if defined(__SSE2__) || defined(_M_X64) || (_M_IX86_FP >= 2) +#define HAVE_SSE2 +#endif + +#if defined(__SSE4_1__) || (defined(_WIN32) && defined(__AVX__)) +#define HAVE_SSE41 +#endif + +#if defined(__SSE4_2__) || (defined(_WIN32) && defined(__AVX__)) +#define HAVE_SSE42 +#endif + +#if defined(__AVX__) +#define HAVE_AVX +#endif + +#if defined(__AVX2__) +#define HAVE_AVX2 +#endif + +#if defined(__AVX512BW__) && defined(BUILD_AVX512) +#define HAVE_AVX512 +#endif + +#if defined(__AVX512VBMI__) && defined(BUILD_AVX512_VBMI) +#define HAVE_AVX512VBMI +#endif +#endif /* __KERNEL__ */ + +/* + * ICC and MSVC don't break out POPCNT or BMI/2 as separate pre-def macros + */ +#if defined(__POPCNT__) || \ + (defined(__INTEL_COMPILER) && defined(__SSE4_2__)) || \ + (defined(_WIN32) && defined(__AVX__)) +#define HAVE_POPCOUNT_INSTR +#endif + +#if defined(__BMI__) || (defined(_WIN32) && defined(__AVX2__)) || \ + (defined(__INTEL_COMPILER) && defined(__AVX2__)) +#define HAVE_BMI +#endif + +#if defined(__BMI2__) || (defined(_WIN32) && defined(__AVX2__)) || \ + (defined(__INTEL_COMPILER) && defined(__AVX2__)) +#define HAVE_BMI2 +#endif + +/* + * MSVC uses a different form of inline asm + */ +#if defined(_WIN32) && defined(_MSC_VER) +#define NO_ASM +#endif + +#endif // UTIL_ARCH_H_ diff --git a/regex/util/bitutils.h b/regex/util/bitutils.h new file mode 100644 index 000000000..c545ee187 --- /dev/null +++ b/regex/util/bitutils.h @@ -0,0 +1,492 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Bit-twiddling primitives (ctz, compress etc) + */ + +#ifndef BITUTILS_H +#define BITUTILS_H + +#include "ue2common.h" +#include "popcount.h" +#include "util/arch.h" +#include "util/intrinsics.h" + +#define CASE_BIT 0x20 +#define CASE_CLEAR 0xdf +#define DOUBLE_CASE_CLEAR 0xdfdf +#define OCTO_CASE_CLEAR 0xdfdfdfdfdfdfdfdfULL + +static really_inline +u32 clz32(u32 x) { + assert(x); // behaviour not defined for x == 0 +#if defined(_WIN32) + unsigned long r; + _BitScanReverse(&r, x); + return 31 - r; +#else + return (u32)__builtin_clz(x); +#endif +} + +static really_inline +u32 clz64(u64a x) { + assert(x); // behaviour not defined for x == 0 +#if defined(_WIN64) + unsigned long r; + _BitScanReverse64(&r, x); + return 63 - r; +#elif defined(_WIN32) + unsigned long x1 = (u32)x; + unsigned long x2 = (u32)(x >> 32); + unsigned long r; + if (x2) { + _BitScanReverse(&r, x2); + return (u32)(31 - r); + } + _BitScanReverse(&r, (u32)x1); + return (u32)(63 - r); +#else + return (u32)__builtin_clzll(x); +#endif +} + +// CTZ (count trailing zero) implementations. 
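+// For example, with x = 0x00010000 (only bit 16 set), ctz32(x) == 16 and
+// clz32(x) == 15; both families assert on x == 0, where the result is
+// undefined.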
+static really_inline +u32 ctz32(u32 x) { + assert(x); // behaviour not defined for x == 0 +#if defined(_WIN32) + unsigned long r; + _BitScanForward(&r, x); + return r; +#else + return (u32)__builtin_ctz(x); +#endif +} + +static really_inline +u32 ctz64(u64a x) { + assert(x); // behaviour not defined for x == 0 +#if defined(_WIN64) + unsigned long r; + _BitScanForward64(&r, x); + return r; +#elif defined(_WIN32) + unsigned long r; + if (_BitScanForward(&r, (u32)x)) { + return (u32)r; + } + _BitScanForward(&r, x >> 32); + return (u32)(r + 32); +#else + return (u32)__builtin_ctzll(x); +#endif +} + +static really_inline +u32 lg2(u32 x) { + if (!x) { + return 0; + } + return 31 - clz32(x); +} + +static really_inline +u64a lg2_64(u64a x) { + if (!x) { + return 0; + } + return 63 - clz64(x); +} + +static really_inline +u32 findAndClearLSB_32(u32 *v) { + assert(*v != 0); // behaviour not defined in this case +#ifndef NO_ASM + u32 val = *v, offset; + __asm__ ("bsf %1, %0\n" + "btr %0, %1\n" + : "=r" (offset), "=r" (val) + : "1" (val)); + *v = val; +#else + u32 val = *v; + u32 offset = ctz32(val); + *v = val & (val - 1); +#endif + + assert(offset < 32); + return offset; +} + +static really_inline +u32 findAndClearLSB_64(u64a *v) { + assert(*v != 0); // behaviour not defined in this case + +#ifdef ARCH_64_BIT +#if defined(ARCH_X86_64) && !defined(NO_ASM) + u64a val = *v, offset; + __asm__ ("bsfq %1, %0\n" + "btrq %0, %1\n" + : "=r" (offset), "=r" (val) + : "1" (val)); + *v = val; +#else + // generic variant using gcc's builtin on 64-bit + u64a val = *v, offset; + offset = ctz64(val); + *v = val & (val - 1); +#endif // ARCH_X86_64 +#else + // fall back to doing things with two 32-bit cases, since gcc-4.1 doesn't + // inline calls to __builtin_ctzll + u32 v1 = (u32)*v; + u32 v2 = (u32)(*v >> 32); + u32 offset; + if (v1) { + offset = findAndClearLSB_32(&v1); + *v = (u64a)v1 | ((u64a)v2 << 32); + } else { + offset = findAndClearLSB_32(&v2) + 32; + *v = (u64a)v2 << 32; + } +#endif + + assert(offset < 64); + return (u32)offset; +} + +static really_inline +u32 findAndClearMSB_32(u32 *v) { + assert(*v != 0); // behaviour not defined in this case +#ifndef NO_ASM + u32 val = *v, offset; + __asm__ ("bsr %1, %0\n" + "btr %0, %1\n" + : "=r" (offset), "=r" (val) + : "1" (val)); + *v = val; +#else + u32 val = *v; + u32 offset = 31 - clz32(val); + *v = val & ~(1 << offset); +#endif + assert(offset < 32); + return offset; +} + +static really_inline +u32 findAndClearMSB_64(u64a *v) { + assert(*v != 0); // behaviour not defined in this case + +#ifdef ARCH_64_BIT +#if defined(ARCH_X86_64) && !defined(NO_ASM) + u64a val = *v, offset; + __asm__ ("bsrq %1, %0\n" + "btrq %0, %1\n" + : "=r" (offset), "=r" (val) + : "1" (val)); + *v = val; +#else + // generic variant using gcc's builtin on 64-bit + u64a val = *v, offset; + offset = 63 - clz64(val); + *v = val & ~(1ULL << offset); +#endif // ARCH_X86_64 +#else + // fall back to doing things with two 32-bit cases, since gcc-4.1 doesn't + // inline calls to __builtin_ctzll + u32 v1 = (u32)*v; + u32 v2 = (*v >> 32); + u32 offset; + if (v2) { + offset = findAndClearMSB_32(&v2) + 32; + *v = ((u64a)v2 << 32) | (u64a)v1; + } else { + offset = findAndClearMSB_32(&v1); + *v = (u64a)v1; + } +#endif + + assert(offset < 64); + return (u32)offset; +} + +static really_inline +u32 compress32(u32 x, u32 m) { +#if defined(HAVE_BMI2) + // BMI2 has a single instruction for this operation. 
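+    // For example, _pext_u32(0xb2 /* 10110010 */, 0x55 /* 01010101 */)
+    // gathers the mask-selected bits of x into the low bits, giving 0x4;
+    // the portable loop below computes the same result without BMI2.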
+ return _pext_u32(x, m); +#else + + // Return zero quickly on trivial cases + if ((x & m) == 0) { + return 0; + } + + u32 mk, mp, mv, t; + + x &= m; // clear irrelevant bits + + mk = ~m << 1; // we will count 0's to right + for (u32 i = 0; i < 5; i++) { + mp = mk ^ (mk << 1); + mp ^= mp << 2; + mp ^= mp << 4; + mp ^= mp << 8; + mp ^= mp << 16; + + mv = mp & m; // bits to move + m = (m ^ mv) | (mv >> (1 << i)); // compress m + t = x & mv; + x = (x ^ t) | (t >> (1 << i)); // compress x + mk = mk & ~mp; + } + + return x; +#endif +} + +static really_inline +u64a compress64(u64a x, u64a m) { +#if defined(ARCH_X86_64) && defined(HAVE_BMI2) + // BMI2 has a single instruction for this operation. + return _pext_u64(x, m); +#else + + // Return zero quickly on trivial cases + if ((x & m) == 0) { + return 0; + } + + u64a mk, mp, mv, t; + + x &= m; // clear irrelevant bits + + mk = ~m << 1; // we will count 0's to right + for (u32 i = 0; i < 6; i++) { + mp = mk ^ (mk << 1); + mp ^= mp << 2; + mp ^= mp << 4; + mp ^= mp << 8; + mp ^= mp << 16; + mp ^= mp << 32; + + mv = mp & m; // bits to move + m = (m ^ mv) | (mv >> (1 << i)); // compress m + t = x & mv; + x = (x ^ t) | (t >> (1 << i)); // compress x + mk = mk & ~mp; + } + + return x; +#endif +} + +static really_inline +u32 expand32(u32 x, u32 m) { +#if defined(HAVE_BMI2) + // BMI2 has a single instruction for this operation. + return _pdep_u32(x, m); +#else + + // Return zero quickly on trivial cases + if (!x || !m) { + return 0; + } + + u32 m0, mk, mp, mv, t; + u32 array[5]; + + m0 = m; // save original mask + mk = ~m << 1; // we will count 0's to right + + for (int i = 0; i < 5; i++) { + mp = mk ^ (mk << 1); // parallel suffix + mp = mp ^ (mp << 2); + mp = mp ^ (mp << 4); + mp = mp ^ (mp << 8); + mp = mp ^ (mp << 16); + mv = mp & m; // bits to move + array[i] = mv; + m = (m ^ mv) | (mv >> (1 << i)); // compress m + mk = mk & ~mp; + } + + for (int i = 4; i >= 0; i--) { + mv = array[i]; + t = x << (1 << i); + x = (x & ~mv) | (t & mv); + } + + return x & m0; // clear out extraneous bits +#endif +} + +static really_inline +u64a expand64(u64a x, u64a m) { +#if defined(ARCH_X86_64) && defined(HAVE_BMI2) + // BMI2 has a single instruction for this operation. + return _pdep_u64(x, m); +#else + + // Return zero quickly on trivial cases + if (!x || !m) { + return 0; + } + + u64a m0, mk, mp, mv, t; + u64a array[6]; + + m0 = m; // save original mask + mk = ~m << 1; // we will count 0's to right + + for (int i = 0; i < 6; i++) { + mp = mk ^ (mk << 1); // parallel suffix + mp = mp ^ (mp << 2); + mp = mp ^ (mp << 4); + mp = mp ^ (mp << 8); + mp = mp ^ (mp << 16); + mp = mp ^ (mp << 32); + mv = mp & m; // bits to move + array[i] = mv; + m = (m ^ mv) | (mv >> (1 << i)); // compress m + mk = mk & ~mp; + } + + for (int i = 5; i >= 0; i--) { + mv = array[i]; + t = x << (1 << i); + x = (x & ~mv) | (t & mv); + } + + return x & m0; // clear out extraneous bits +#endif +} + + +/* returns the first set bit after begin (if not ~0U). If no bit is set after + * begin returns ~0U + */ +static really_inline +u32 bf64_iterate(u64a bitfield, u32 begin) { + if (begin != ~0U) { + /* switch off all bits at or below begin. Note: not legal to shift by + * by size of the datatype or larger. 
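+         * For example, with begin == 0 the mask ~((2ULL << 0) - 1) clears
+         * only bit 0, so bf64_iterate(0x11, 0) skips bit 0 and returns 4.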
*/ + assert(begin <= 63); + bitfield &= ~((2ULL << begin) - 1); + } + + if (!bitfield) { + return ~0U; + } + + return ctz64(bitfield); +} + +static really_inline +char bf64_set(u64a *bitfield, u32 i) { + assert(i < 64); + u64a mask = 1ULL << i; + char was_set = !!(*bitfield & mask); + *bitfield |= mask; + + return was_set; +} + +static really_inline +void bf64_unset(u64a *bitfield, u32 i) { + assert(i < 64); + *bitfield &= ~(1ULL << i); +} + +static really_inline +u32 rank_in_mask32(u32 mask, u32 bit) { + assert(bit < sizeof(u32) * 8); + assert(mask & (u32)(1U << bit)); + mask &= (u32)(1U << bit) - 1; + return popcount32(mask); +} + +static really_inline +u32 rank_in_mask64(u64a mask, u32 bit) { + assert(bit < sizeof(u64a) * 8); + assert(mask & (u64a)(1ULL << bit)); + mask &= (u64a)(1ULL << bit) - 1; + return popcount64(mask); +} + +static really_inline +u32 pext32(u32 x, u32 mask) { +#if defined(HAVE_BMI2) + // Intel BMI2 can do this operation in one instruction. + return _pext_u32(x, mask); +#else + + u32 result = 0, num = 1; + while (mask != 0) { + u32 bit = findAndClearLSB_32(&mask); + if (x & (1U << bit)) { + assert(num != 0); // more than 32 bits! + result |= num; + } + num <<= 1; + } + return result; +#endif +} + +static really_inline +u64a pext64(u64a x, u64a mask) { +#if defined(HAVE_BMI2) && defined(ARCH_64_BIT) + // Intel BMI2 can do this operation in one instruction. + return _pext_u64(x, mask); +#else + + u32 result = 0, num = 1; + while (mask != 0) { + u32 bit = findAndClearLSB_64(&mask); + if (x & (1ULL << bit)) { + assert(num != 0); // more than 32 bits! + result |= num; + } + num <<= 1; + } + return result; +#endif +} + +#if defined(HAVE_BMI2) && defined(ARCH_64_BIT) +static really_inline +u64a pdep64(u64a x, u64a mask) { + return _pdep_u64(x, mask); +} +#endif + +#endif // BITUTILS_H diff --git a/regex/util/compare.h b/regex/util/compare.h new file mode 100644 index 000000000..eaa717a4c --- /dev/null +++ b/regex/util/compare.h @@ -0,0 +1,183 @@ +/* + * Copyright (c) 2015-2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef COMPARE_H +#define COMPARE_H + +#include "unaligned.h" +#include "ue2common.h" + +/* Our own definitions of tolower, toupper and isalpha are provided to prevent + * us from going out to libc for these tests. */ + +static really_inline +char myisupper(const char c) { + return ((c >= 'A') && (c <= 'Z')); +} + +static really_inline +char myislower(const char c) { + return ((c >= 'a') && (c <= 'z')); +} + +static really_inline +char mytolower(const char c) { + if (myisupper(c)) { + return c + 0x20; + } + return c; +} + +static really_inline +char mytoupper(const char c) { + if (myislower(c)) { + return c - 0x20; + } + return c; +} + +/* this is a slightly warped definition of `alpha'. What we really + * mean is: does this character have different uppercase and lowercase forms? + */ +static really_inline char ourisalpha(const char c) { + return mytolower(c) != mytoupper(c); +} + +static really_inline char ourisprint(const char c) { + return c >= 0x20 && c <= 0x7e; +} + +// Paul Hsieh's SWAR toupper; used because it doesn't +// matter whether we go toupper or tolower. We should +// probably change the other one +static really_inline +u32 theirtoupper32(const u32 x) { + u32 b = 0x80808080ul | x; + u32 c = b - 0x61616161ul; + u32 d = ~(b - 0x7b7b7b7bul); + u32 e = (c & d) & (~x & 0x80808080ul); + return x - (e >> 2); +} + +// 64-bit variant. +static really_inline +u64a theirtoupper64(const u64a x) { + u64a b = 0x8080808080808080ull | x; + u64a c = b - 0x6161616161616161ull; + u64a d = ~(b - 0x7b7b7b7b7b7b7b7bull); + u64a e = (c & d) & (~x & 0x8080808080808080ull); + u64a v = x - (e >> 2); + return v; +} + +static really_inline +int cmpNocaseNaive(const u8 *p1, const u8 *p2, size_t len) { + const u8 *pEnd = p1 + len; + for (; p1 < pEnd; p1++, p2++) { + assert(!ourisalpha(*p2) || myisupper(*p2)); // Already upper-case. + if ((u8)mytoupper(*p1) != *p2) { + return 1; + } + } + return 0; +} + +static really_inline +int cmpCaseNaive(const u8 *p1, const u8 *p2, size_t len) { + const u8 *pEnd = p1 + len; + for (; p1 < pEnd; p1++, p2++) { + if (*p1 != *p2) { + return 1; + } + } + return 0; +} + +#ifdef ARCH_64_BIT +# define CMP_T u64a +# define ULOAD(x) unaligned_load_u64a(x) +# define TOUPPER(x) theirtoupper64(x) +#else +# define CMP_T u32 +# define ULOAD(x) unaligned_load_u32(x) +# define TOUPPER(x) theirtoupper32(x) +#endif + +#define CMP_SIZE sizeof(CMP_T) + +/** + * \brief Compare two strings, optionally caselessly. + * + * Note: If nocase is true, p2 is assumed to be already upper-case. + */ +#if defined(ARCH_IA32) +static UNUSED never_inline +#else +static really_inline +#endif +int cmpForward(const u8 *p1, const u8 *p2, size_t len, char nocase) { + if (len < CMP_SIZE) { + return nocase ? cmpNocaseNaive(p1, p2, len) + : cmpCaseNaive(p1, p2, len); + } + + const u8 *p1_end = p1 + len - CMP_SIZE; + const u8 *p2_end = p2 + len - CMP_SIZE; + + if (nocase) { // Case-insensitive version. 
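+        // Words of p1 are upper-cased on the fly with the SWAR helper above
+        // and compared against the already upper-cased p2, CMP_SIZE bytes
+        // at a time; the final word at p1_end/p2_end may overlap the
+        // previous one, which handles the tail without a byte loop.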
+ for (; p1 < p1_end; p1 += CMP_SIZE, p2 += CMP_SIZE) { + assert(ULOAD(p2) == TOUPPER(ULOAD(p2))); // Already upper-case. + if (TOUPPER(ULOAD(p1)) != ULOAD(p2)) { + return 1; + } + } + assert(ULOAD(p2_end) == TOUPPER(ULOAD(p2_end))); // Already upper-case. + if (TOUPPER(ULOAD(p1_end)) != ULOAD(p2_end)) { + return 1; + } + } else { // Case-sensitive version. + for (; p1 < p1_end; p1 += CMP_SIZE, p2 += CMP_SIZE) { + if (ULOAD(p1) != ULOAD(p2)) { + return 1; + } + } + if (ULOAD(p1_end) != ULOAD(p2_end)) { + return 1; + } + } + + return 0; +} + +#undef CMP_T +#undef ULOAD +#undef TOUPPER +#undef CMP_SIZE + +#endif + diff --git a/regex/util/copybytes.h b/regex/util/copybytes.h new file mode 100644 index 000000000..7f37d96bc --- /dev/null +++ b/regex/util/copybytes.h @@ -0,0 +1,113 @@ +/* + * Copyright (c) 2016-2020, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef COPY_BYTES_H +#define COPY_BYTES_H + +#include "unaligned.h" +#include "simd_utils.h" + +static really_inline +void copy_upto_64_bytes(u8 *dst, const u8 *src, unsigned int len) { + switch (len) { + case 0: + break; + case 1: + *dst = *src; + break; + case 2: + unaligned_store_u16(dst, unaligned_load_u16(src)); + break; + case 3: + unaligned_store_u16(dst, unaligned_load_u16(src)); + dst[2] = src[2]; + break; + case 4: + unaligned_store_u32(dst, unaligned_load_u32(src)); + break; + case 5: + case 6: + case 7: + unaligned_store_u32(dst + len - 4, unaligned_load_u32(src + len - 4)); + unaligned_store_u32(dst, unaligned_load_u32(src)); + break; + case 8: + unaligned_store_u64a(dst, unaligned_load_u64a(src)); + break; + case 9: + case 10: + case 11: + case 12: + case 13: + case 14: + case 15: + unaligned_store_u64a(dst + len - 8, unaligned_load_u64a(src + len - 8)); + unaligned_store_u64a(dst, unaligned_load_u64a(src)); + break; + case 16: + storeu128(dst, loadu128(src)); + break; + case 17: + case 18: + case 19: + case 20: + case 21: + case 22: + case 23: + case 24: + case 25: + case 26: + case 27: + case 28: + case 29: + case 30: + case 31: + storeu128(dst + len - 16, loadu128(src + len - 16)); + storeu128(dst, loadu128(src)); + break; + case 32: + storeu256(dst, loadu256(src)); + break; +#ifdef HAVE_AVX512 + case 64: + storebytes512(dst, loadu512(src), 64); + break; + default: + assert(len < 64); + u64a k = (1ULL << len) - 1; + storeu_mask_m512(dst, k, loadu_maskz_m512(k, src)); + break; +#else + default: + assert(0); + break; +#endif + } +} + +#endif diff --git a/regex/util/cpuid_flags.c b/regex/util/cpuid_flags.c new file mode 100644 index 000000000..c00ce58e2 --- /dev/null +++ b/regex/util/cpuid_flags.c @@ -0,0 +1,176 @@ +/* + * Copyright (c) 2015-2020, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "cpuid_flags.h" +#include "cpuid_inline.h" +#include "ue2common.h" +#include "hs_compile.h" // for HS_MODE_ flags +#include "hs_internal.h" +#include "util/arch.h" + +#if !defined(_WIN32) && !defined(CPUID_H_) +#include +#endif + +u64a cpuid_flags(void) { + u64a cap = 0; + + if (check_avx2()) { + DEBUG_PRINTF("AVX2 enabled\n"); + cap |= HS_CPU_FEATURES_AVX2; + } + + if (check_avx512()) { + DEBUG_PRINTF("AVX512 enabled\n"); + cap |= HS_CPU_FEATURES_AVX512; + } + + if (check_avx512vbmi()) { + DEBUG_PRINTF("AVX512VBMI enabled\n"); + cap |= HS_CPU_FEATURES_AVX512VBMI; + } + +#if !defined(FAT_RUNTIME) && !defined(HAVE_AVX2) + cap &= ~HS_CPU_FEATURES_AVX2; +#endif + +#if (!defined(FAT_RUNTIME) && !defined(HAVE_AVX512)) || \ + (defined(FAT_RUNTIME) && !defined(BUILD_AVX512)) + cap &= ~HS_CPU_FEATURES_AVX512; +#endif + +#if (!defined(FAT_RUNTIME) && !defined(HAVE_AVX512VBMI)) || \ + (defined(FAT_RUNTIME) && !defined(BUILD_AVX512VBMI)) + cap &= ~HS_CPU_FEATURES_AVX512VBMI; +#endif + + return cap; +} + +struct family_id { + u32 full_family; + u32 full_model; + u32 tune; +}; + +/* from table 35-1 of the Intel 64 and IA32 Arch. Software Developer's Manual + * and "Intel Architecture and Processor Identification With CPUID Model and + * Family Numbers" */ +static const struct family_id known_microarch[] = { + { 0x6, 0x37, HS_TUNE_FAMILY_SLM }, /* baytrail */ + { 0x6, 0x4A, HS_TUNE_FAMILY_SLM }, /* silvermont */ + { 0x6, 0x4C, HS_TUNE_FAMILY_SLM }, /* silvermont */ + { 0x6, 0x4D, HS_TUNE_FAMILY_SLM }, /* avoton, rangley */ + { 0x6, 0x5A, HS_TUNE_FAMILY_SLM }, /* silvermont */ + { 0x6, 0x5D, HS_TUNE_FAMILY_SLM }, /* silvermont */ + + { 0x6, 0x5C, HS_TUNE_FAMILY_GLM }, /* goldmont */ + { 0x6, 0x5F, HS_TUNE_FAMILY_GLM }, /* denverton */ + + { 0x6, 0x3C, HS_TUNE_FAMILY_HSW }, /* haswell */ + { 0x6, 0x45, HS_TUNE_FAMILY_HSW }, /* haswell */ + { 0x6, 0x46, HS_TUNE_FAMILY_HSW }, /* haswell */ + { 0x6, 0x3F, HS_TUNE_FAMILY_HSW }, /* haswell Xeon */ + + { 0x6, 0x3E, HS_TUNE_FAMILY_IVB }, /* ivybridge Xeon */ + { 0x6, 0x3A, HS_TUNE_FAMILY_IVB }, /* ivybridge */ + + { 0x6, 0x2A, HS_TUNE_FAMILY_SNB }, /* sandybridge */ + { 0x6, 0x2D, HS_TUNE_FAMILY_SNB }, /* sandybridge Xeon */ + + { 0x6, 0x3D, HS_TUNE_FAMILY_BDW }, /* broadwell Core-M */ + { 0x6, 0x47, HS_TUNE_FAMILY_BDW }, /* broadwell */ + { 0x6, 0x4F, HS_TUNE_FAMILY_BDW }, /* broadwell xeon */ + { 0x6, 0x56, HS_TUNE_FAMILY_BDW }, /* broadwell xeon-d */ + + { 0x6, 0x4E, HS_TUNE_FAMILY_SKL }, /* Skylake Mobile */ + { 0x6, 0x5E, HS_TUNE_FAMILY_SKL }, /* Skylake Core/E3 Xeon */ + { 0x6, 0x55, HS_TUNE_FAMILY_SKX }, /* Skylake Xeon */ + + { 0x6, 0x8E, HS_TUNE_FAMILY_SKL }, /* Kabylake Mobile */ + { 0x6, 0x9E, HS_TUNE_FAMILY_SKL }, /* Kabylake desktop */ + + { 0x6, 0x7D, HS_TUNE_FAMILY_ICL }, /* Icelake */ + { 0x6, 0x7E, HS_TUNE_FAMILY_ICL }, /* Icelake */ + { 0x6, 0x6A, HS_TUNE_FAMILY_ICX }, /* Icelake Xeon-D */ + { 0x6, 0x6C, HS_TUNE_FAMILY_ICX }, /* Icelake Xeon */ + +}; + +#ifdef DUMP_SUPPORT +static UNUSED +const char *dumpTune(u32 tune) { +#define T_CASE(x) case x: return #x; + switch (tune) { + T_CASE(HS_TUNE_FAMILY_SLM); + T_CASE(HS_TUNE_FAMILY_GLM); + T_CASE(HS_TUNE_FAMILY_HSW); + T_CASE(HS_TUNE_FAMILY_SNB); + T_CASE(HS_TUNE_FAMILY_IVB); + T_CASE(HS_TUNE_FAMILY_BDW); + T_CASE(HS_TUNE_FAMILY_SKL); + T_CASE(HS_TUNE_FAMILY_SKX); + T_CASE(HS_TUNE_FAMILY_ICL); + T_CASE(HS_TUNE_FAMILY_ICX); + } +#undef T_CASE + return "unknown"; +} +#endif + +u32 cpuid_tune(void) { + unsigned int eax, ebx, ecx, edx; + + cpuid(1, 0, &eax, &ebx, &ecx, &edx); + + u32 
family = (eax >> 8) & 0xf; + u32 model = 0; + + if (family == 0x6 || family == 0xf) { + model = ((eax >> 4) & 0xf) | ((eax >> 12) & 0xf0); + } else { + model = (eax >> 4) & 0xf; + } + + DEBUG_PRINTF("family = %xh model = %xh\n", family, model); + for (u32 i = 0; i < ARRAY_LENGTH(known_microarch); i++) { + if (family != known_microarch[i].full_family) { + continue; + } + + if (model != known_microarch[i].full_model) { + continue; + } + + u32 tune = known_microarch[i].tune; + DEBUG_PRINTF("found tune flag %s\n", dumpTune(tune) ); + return tune; + } + + return HS_TUNE_FAMILY_GENERIC; +} diff --git a/regex/util/cpuid_flags.h b/regex/util/cpuid_flags.h new file mode 100644 index 000000000..527c6d52f --- /dev/null +++ b/regex/util/cpuid_flags.h @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef UTIL_CPUID_H_ +#define UTIL_CPUID_H_ + +#include "ue2common.h" + +#if !defined(_WIN32) && !defined(CPUID_H_) +#include + /* system header doesn't have a header guard */ +#define CPUID_H_ +#endif + +#ifdef __cplusplus +extern "C" +{ +#endif + +/* returns HS_CPU_FEATURES_* mask. */ +u64a cpuid_flags(void); + +u32 cpuid_tune(void); + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif /* UTIL_CPUID_H_ */ + diff --git a/regex/util/cpuid_inline.h b/regex/util/cpuid_inline.h new file mode 100644 index 000000000..b7b424528 --- /dev/null +++ b/regex/util/cpuid_inline.h @@ -0,0 +1,260 @@ +/* + * Copyright (c) 2017-2020, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef CPUID_INLINE_H_ +#define CPUID_INLINE_H_ + +#include "ue2common.h" +#include "cpuid_flags.h" + +#if !defined(_WIN32) && !defined(CPUID_H_) +#include +/* system header doesn't have a header guard */ +#define CPUID_H_ +#endif + +#ifdef __cplusplus +extern "C" +{ +#endif + +static inline +void cpuid(unsigned int op, unsigned int leaf, unsigned int *eax, + unsigned int *ebx, unsigned int *ecx, unsigned int *edx) { +#ifndef _WIN32 + __cpuid_count(op, leaf, *eax, *ebx, *ecx, *edx); +#else + int a[4]; + __cpuidex(a, op, leaf); + *eax = a[0]; + *ebx = a[1]; + *ecx = a[2]; + *edx = a[3]; +#endif +} + +// ECX +#define CPUID_SSE3 (1 << 0) +#define CPUID_SSSE3 (1 << 9) +#define CPUID_SSE4_1 (1 << 19) +#define CPUID_SSE4_2 (1 << 20) +#define CPUID_POPCNT (1 << 23) +#define CPUID_XSAVE (1 << 27) +#define CPUID_AVX (1 << 28) + +// EDX +#define CPUID_FXSAVE (1 << 24) +#define CPUID_SSE (1 << 25) +#define CPUID_SSE2 (1 << 26) +#define CPUID_HTT (1 << 28) + +// Structured Extended Feature Flags Enumeration Leaf ECX values +#define CPUID_AVX512VBMI (1 << 1) + +// Structured Extended Feature Flags Enumeration Leaf EBX values +#define CPUID_BMI (1 << 3) +#define CPUID_AVX2 (1 << 5) +#define CPUID_BMI2 (1 << 8) +#define CPUID_AVX512F (1 << 16) +#define CPUID_AVX512BW (1 << 30) + +// Extended Control Register 0 (XCR0) values +#define CPUID_XCR0_SSE (1 << 1) +#define CPUID_XCR0_AVX (1 << 2) +#define CPUID_XCR0_OPMASK (1 << 5) // k-regs +#define CPUID_XCR0_ZMM_Hi256 (1 << 6) // upper 256 bits of ZMM0-ZMM15 +#define CPUID_XCR0_Hi16_ZMM (1 << 7) // ZMM16-ZMM31 + +#define CPUID_XCR0_AVX512 \ + (CPUID_XCR0_OPMASK | CPUID_XCR0_ZMM_Hi256 | CPUID_XCR0_Hi16_ZMM) + +static inline +u64a xgetbv(u32 op) { +#if defined(_WIN32) || defined(__INTEL_COMPILER) + return _xgetbv(op); +#else + u32 a, d; + __asm__ volatile ( + "xgetbv\n" + : "=a"(a), + "=d"(d) + : "c"(op)); + return ((u64a)d << 32) + a; +#endif +} + +static inline +int check_avx2(void) { +#if defined(__INTEL_COMPILER) + return _may_i_use_cpu_feature(_FEATURE_AVX2); +#else + unsigned int eax, ebx, ecx, edx; + + cpuid(1, 0, &eax, &ebx, &ecx, &edx); + + /* check AVX is supported and XGETBV is enabled by OS */ + if ((ecx & (CPUID_AVX | CPUID_XSAVE)) != (CPUID_AVX | CPUID_XSAVE)) { + DEBUG_PRINTF("AVX and XSAVE not supported\n"); + return 0; + } + + /* check that SSE and AVX registers are enabled by OS */ + u64a xcr0 = xgetbv(0); + if ((xcr0 & (CPUID_XCR0_SSE | CPUID_XCR0_AVX)) != + (CPUID_XCR0_SSE | CPUID_XCR0_AVX)) { + DEBUG_PRINTF("SSE and AVX registers not enabled\n"); + 
return 0; + } + + /* ECX and EDX contain capability flags */ + ecx = 0; + cpuid(7, 0, &eax, &ebx, &ecx, &edx); + + if (ebx & CPUID_AVX2) { + DEBUG_PRINTF("AVX2 enabled\n"); + return 1; + } + + return 0; +#endif +} + +static inline +int check_avx512(void) { + /* + * For our purposes, having avx512 really means "can we use AVX512BW?" + */ +#if defined(__INTEL_COMPILER) + return _may_i_use_cpu_feature(_FEATURE_AVX512BW | _FEATURE_AVX512VL); +#else + unsigned int eax, ebx, ecx, edx; + + cpuid(1, 0, &eax, &ebx, &ecx, &edx); + + /* check XSAVE is enabled by OS */ + if (!(ecx & CPUID_XSAVE)) { + DEBUG_PRINTF("AVX and XSAVE not supported\n"); + return 0; + } + + /* check that AVX 512 registers are enabled by OS */ + u64a xcr0 = xgetbv(0); + if ((xcr0 & CPUID_XCR0_AVX512) != CPUID_XCR0_AVX512) { + DEBUG_PRINTF("AVX512 registers not enabled\n"); + return 0; + } + + /* ECX and EDX contain capability flags */ + ecx = 0; + cpuid(7, 0, &eax, &ebx, &ecx, &edx); + + if (!(ebx & CPUID_AVX512F)) { + DEBUG_PRINTF("AVX512F (AVX512 Foundation) instructions not enabled\n"); + return 0; + } + + if (ebx & CPUID_AVX512BW) { + DEBUG_PRINTF("AVX512BW instructions enabled\n"); + return 1; + } + + return 0; +#endif +} + +static inline +int check_avx512vbmi(void) { +#if defined(__INTEL_COMPILER) + return _may_i_use_cpu_feature(_FEATURE_AVX512VBMI); +#else + unsigned int eax, ebx, ecx, edx; + + cpuid(1, 0, &eax, &ebx, &ecx, &edx); + + /* check XSAVE is enabled by OS */ + if (!(ecx & CPUID_XSAVE)) { + DEBUG_PRINTF("AVX and XSAVE not supported\n"); + return 0; + } + + /* check that AVX 512 registers are enabled by OS */ + u64a xcr0 = xgetbv(0); + if ((xcr0 & CPUID_XCR0_AVX512) != CPUID_XCR0_AVX512) { + DEBUG_PRINTF("AVX512 registers not enabled\n"); + return 0; + } + + /* ECX and EDX contain capability flags */ + ecx = 0; + cpuid(7, 0, &eax, &ebx, &ecx, &edx); + + if (!(ebx & CPUID_AVX512F)) { + DEBUG_PRINTF("AVX512F (AVX512 Foundation) instructions not enabled\n"); + return 0; + } + + if (!(ebx & CPUID_AVX512BW)) { + DEBUG_PRINTF("AVX512BW instructions not enabled\n"); + return 0; + } + + if (ecx & CPUID_AVX512VBMI) { + DEBUG_PRINTF("AVX512VBMI instructions enabled\n"); + return 1; + } + + return 0; +#endif +} + +static inline +int check_ssse3(void) { + unsigned int eax, ebx, ecx, edx; + cpuid(1, 0, &eax, &ebx, &ecx, &edx); + return !!(ecx & CPUID_SSSE3); +} + +static inline +int check_sse42(void) { + unsigned int eax, ebx, ecx, edx; + cpuid(1, 0, &eax, &ebx, &ecx, &edx); + return !!(ecx & CPUID_SSE4_2); +} + +static inline +int check_popcnt(void) { + unsigned int eax, ebx, ecx, edx; + cpuid(1, 0, &eax, &ebx, &ecx, &edx); + return !!(ecx & CPUID_POPCNT); +} + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif /* CPUID_INLINE_H_ */ diff --git a/regex/util/exhaust.h b/regex/util/exhaust.h new file mode 100644 index 000000000..d6f2ac06d --- /dev/null +++ b/regex/util/exhaust.h @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2015-2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Inline functions for manipulating exhaustion vector. + */ + +#ifndef EXHAUST_H +#define EXHAUST_H + +#include "ue2common.h" + +/** Index meaning a given exhaustion key is invalid. */ +#define INVALID_EKEY (~(u32)0) + +#endif diff --git a/regex/util/fatbit.h b/regex/util/fatbit.h new file mode 100644 index 000000000..3c65db1a5 --- /dev/null +++ b/regex/util/fatbit.h @@ -0,0 +1,93 @@ +/* + * Copyright (c) 2015-2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef FATBIT_H +#define FATBIT_H + +/** \file + * \brief Multibit: fast bitset structure for use in scratch. + * Uses more space than mmbit, to avoid partial words for hopefully a taddy more + * performance. + * + * API is also trimmed down. 
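+ *
+ * A minimal usage sketch (illustrative only: scratch_buf and n_keys are
+ * assumed to be supplied by the caller, with the fatbit sized and aligned
+ * elsewhere):
+ *
+ *     struct fatbit *fb = scratch_buf;
+ *     fatbit_clear(fb);
+ *     fatbit_set(fb, n_keys, 42);                  // returns 0: not already set
+ *     assert(fatbit_isset(fb, n_keys, 42));
+ *     u32 k = fatbit_iterate(fb, n_keys, MMB_INVALID); // first set key: 42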
+ */ + +#include "multibit.h" +#include "ue2common.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define MIN_FAT_SIZE 32 + +struct fatbit { + union { + u64a flat[MIN_FAT_SIZE / sizeof(u64a)]; + u8 raw[MIN_FAT_SIZE]; + } fb_int; + u64a tail[]; +}; + +static really_inline +void fatbit_clear(struct fatbit *bits) { + assert(ISALIGNED(bits)); + memset(bits, 0, sizeof(struct fatbit)); +} + +static really_inline +char fatbit_set(struct fatbit *bits, u32 total_bits, u32 key) { + assert(ISALIGNED(bits)); + return mmbit_set(bits->fb_int.raw, total_bits, key); +} + +static really_inline +void fatbit_unset(struct fatbit *bits, u32 total_bits, u32 key) { + assert(ISALIGNED(bits)); + mmbit_unset(bits->fb_int.raw, total_bits, key); +} + +static really_inline +char fatbit_isset(const struct fatbit *bits, u32 total_bits, u32 key) { + assert(ISALIGNED(bits)); + return mmbit_isset(bits->fb_int.raw, total_bits, key); +} + +static really_inline +u32 fatbit_iterate(const struct fatbit *bits, u32 total_bits, u32 it_in) { + assert(ISALIGNED(bits)); + /* TODO: iterate_flat could be specialised as we don't have to worry about + * partial blocks. */ + return mmbit_iterate(bits->fb_int.raw, total_bits, it_in); +} + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif diff --git a/regex/util/intrinsics.h b/regex/util/intrinsics.h new file mode 100644 index 000000000..0156f9ed3 --- /dev/null +++ b/regex/util/intrinsics.h @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +/** \file + * \brief Wrapper around the compiler supplied intrinsic header + */ + +#ifndef INTRINSICS_H +#define INTRINSICS_H + +#include "config.h" + +#ifdef __cplusplus +# if defined(HAVE_CXX_X86INTRIN_H) +# define USE_X86INTRIN_H +# endif +#else // C +# if defined(HAVE_C_X86INTRIN_H) +# define USE_X86INTRIN_H +# endif +#endif + +#ifdef __cplusplus +# if defined(HAVE_CXX_INTRIN_H) +# define USE_INTRIN_H +# endif +#else // C +# if defined(HAVE_C_INTRIN_H) +# define USE_INTRIN_H +# endif +#endif + +#if defined(USE_X86INTRIN_H) +#ifdef __KERNEL__ +#define _MM_MALLOC_H_INCLUDED +#endif +#include +#elif defined(USE_INTRIN_H) +#include +#else +#error no intrinsics file +#endif + +#endif // INTRINSICS_H diff --git a/regex/util/join.h b/regex/util/join.h new file mode 100644 index 000000000..7d5a30c39 --- /dev/null +++ b/regex/util/join.h @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2015, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef JOIN_H + +#define JOIN(x, y) JOIN_AGAIN(x, y) +#define JOIN_AGAIN(x, y) x ## y + +#define JOIN3(x, y, z) JOIN_AGAIN3(x, y, z) +#define JOIN_AGAIN3(x, y, z) x ## y ## z + +#define JOIN4(w, x, y, z) JOIN_AGAIN4(w, x, y, z) +#define JOIN_AGAIN4(w, x, y, z) w ## x ## y ## z + +#endif diff --git a/regex/util/logical.h b/regex/util/logical.h new file mode 100644 index 000000000..0c8b6469a --- /dev/null +++ b/regex/util/logical.h @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2018, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Inline functions for manipulating logical combinations. + */ + +#ifndef LOGICAL_H +#define LOGICAL_H + +#include "ue2common.h" + +/** Index meaning a given logical key is invalid. */ +#define INVALID_LKEY (~(u32)0) +#define INVALID_CKEY INVALID_LKEY + +/** Logical operation type, the priority is from high to low. */ +enum LogicalOpType { + LOGICAL_OP_NOT, + LOGICAL_OP_AND, + LOGICAL_OP_OR, + LAST_LOGICAL_OP = LOGICAL_OP_OR //!< Sentinel. +}; + +#define UNKNOWN_OP (~(u32)0) + +/** Logical Operation is consist of 4 parts. */ +struct LogicalOp { + u32 id; //!< logical operator/operation id + u32 op; //!< LogicalOpType + u32 lo; //!< left operand + u32 ro; //!< right operand +}; + +/** Each logical combination has its info: + * It occupies a region in LogicalOp vector. + * It has an exhaustion key for single-match mode. */ +struct CombInfo { + u32 id; + u32 ekey; //!< exhaustion key + u32 start; //!< ckey of logical operation to start calculating + u32 result; //!< ckey of logical operation to give final result + u64a min_offset; + u64a max_offset; +}; + +/** Temporarily use to seperate operations' id from reports' lkey + * when building logicalTree in shunting yard algorithm, + * operations' id will be finally renumbered following reports' lkey. */ +#define LOGICAL_OP_BIT 0x80000000UL + +#endif diff --git a/regex/util/masked_move.c b/regex/util/masked_move.c new file mode 100644 index 000000000..001cd49f2 --- /dev/null +++ b/regex/util/masked_move.c @@ -0,0 +1,91 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + + +#include "ue2common.h" +#include "masked_move.h" +#include "util/arch.h" + +#if defined(HAVE_AVX2) +/* masks for masked moves */ + +/* magic mask for maskload (vmmaskmovq) - described in UE-2424 */ +const ALIGN_CL_DIRECTIVE u32 mm_mask_mask[16] = { + 0x00000000U, + 0x00000000U, + 0x00000000U, + 0x00000000U, + 0x00000000U, + 0x00000000U, + 0x00000000U, + 0x00000000U, + 0xff000000U, + 0xfe000000U, + 0xfc000000U, + 0xf8000000U, + 0xf0000000U, + 0xe0000000U, + 0xc0000000U, + 0x80000000U, +}; + +const u32 mm_shuffle_end[32][8] = { + { 0x03020100U, 0x80808080U, 0x80808080U, 0x80808080U, 0x80808080U, 0x80808080U, 0x80808080U, 0x80808080U, }, + { 0x02010080U, 0x80808003U, 0x80808080U, 0x80808080U, 0x80808080U, 0x80808080U, 0x80808080U, 0x80808080U, }, + { 0x01008080U, 0x80800302U, 0x80808080U, 0x80808080U, 0x80808080U, 0x80808080U, 0x80808080U, 0x80808080U, }, + { 0x00808080U, 0x80030201U, 0x80808080U, 0x80808080U, 0x80808080U, 0x80808080U, 0x80808080U, 0x80808080U, }, + { 0x80808080U, 0x03020100U, 0x80808080U, 0x80808080U, 0x80808080U, 0x80808080U, 0x80808080U, 0x80808080U, }, + { 0x80808080U, 0x02010080U, 0x80808003U, 0x80808080U, 0x80808080U, 0x80808080U, 0x80808080U, 0x80808080U, }, + { 0x80808080U, 0x01008080U, 0x80800302U, 0x80808080U, 0x80808080U, 0x80808080U, 0x80808080U, 0x80808080U, }, + { 0x80808080U, 0x00808080U, 0x80030201U, 0x80808080U, 0x80808080U, 0x80808080U, 0x80808080U, 0x80808080U, }, + { 0x80808080U, 0x80808080U, 0x03020100U, 0x80808080U, 0x80808080U, 0x80808080U, 0x80808080U, 0x80808080U, }, + { 0x80808080U, 0x80808080U, 0x02010080U, 0x80808003U, 0x80808080U, 0x80808080U, 0x80808080U, 0x80808080U, }, + { 0x80808080U, 0x80808080U, 0x01008080U, 0x80800302U, 0x80808080U, 0x80808080U, 0x80808080U, 0x80808080U, }, + { 0x80808080U, 0x80808080U, 0x00808080U, 0x80030201U, 0x80808080U, 0x80808080U, 0x80808080U, 0x80808080U, }, + { 0x80808080U, 0x80808080U, 0x80808080U, 0x03020100U, 0x80808080U, 0x80808080U, 0x80808080U, 0x80808080U, }, + { 0x80808080U, 0x80808080U, 0x80808080U, 0x02010080U, 0x80808003U, 0x80808080U, 0x80808080U, 0x80808080U, }, + { 0x80808080U, 0x80808080U, 0x80808080U, 0x01008080U, 0x80800302U, 0x80808080U, 0x80808080U, 0x80808080U, }, + { 0x80808080U, 0x80808080U, 0x80808080U, 0x00808080U, 0x80030201U, 0x80808080U, 0x80808080U, 0x80808080U, }, + { 0x80808080U, 0x80808080U, 0x80808080U, 0x80808080U, 0x03020100U, 0x80808080U, 0x80808080U, 0x80808080U, }, + { 0x80808080U, 0x80808080U, 0x80808080U, 0x80808080U, 0x02010080U, 0x80808003U, 0x80808080U, 0x80808080U, }, + { 0x80808080U, 0x80808080U, 0x80808080U, 0x80808080U, 0x01008080U, 0x80800302U, 0x80808080U, 0x80808080U, }, + { 0x80808080U, 0x80808080U, 0x80808080U, 0x80808080U, 0x00808080U, 0x80030201U, 0x80808080U, 0x80808080U, }, + { 0x80808080U, 0x80808080U, 0x80808080U, 0x80808080U, 0x80808080U, 0x03020100U, 0x80808080U, 0x80808080U, }, + { 0x80808080U, 0x80808080U, 0x80808080U, 0x80808080U, 0x80808080U, 0x02010080U, 0x80808003U, 0x80808080U, }, + { 0x80808080U, 
0x80808080U, 0x80808080U, 0x80808080U, 0x80808080U, 0x01008080U, 0x80800302U, 0x80808080U, }, + { 0x80808080U, 0x80808080U, 0x80808080U, 0x80808080U, 0x80808080U, 0x00808080U, 0x80030201U, 0x80808080U, }, + { 0x80808080U, 0x80808080U, 0x80808080U, 0x80808080U, 0x80808080U, 0x80808080U, 0x03020100U, 0x80808080U, }, + { 0x80808080U, 0x80808080U, 0x80808080U, 0x80808080U, 0x80808080U, 0x80808080U, 0x02010080U, 0x80808003U, }, + { 0x80808080U, 0x80808080U, 0x80808080U, 0x80808080U, 0x80808080U, 0x80808080U, 0x01008080U, 0x80800302U, }, + { 0x80808080U, 0x80808080U, 0x80808080U, 0x80808080U, 0x80808080U, 0x80808080U, 0x00808080U, 0x80030201U, }, + { 0x80808080U, 0x80808080U, 0x80808080U, 0x80808080U, 0x80808080U, 0x80808080U, 0x80808080U, 0x03020100U, }, + { 0x80808080U, 0x80808080U, 0x80808080U, 0x80808080U, 0x80808080U, 0x80808080U, 0x80808080U, 0x02010080U, }, + { 0x80808080U, 0x80808080U, 0x80808080U, 0x80808080U, 0x80808080U, 0x80808080U, 0x80808080U, 0x01008080U, }, + { 0x80808080U, 0x80808080U, 0x80808080U, 0x80808080U, 0x80808080U, 0x80808080U, 0x80808080U, 0x00808080U, }, +}; +#endif // AVX2 diff --git a/regex/util/masked_move.h b/regex/util/masked_move.h new file mode 100644 index 000000000..4c877ca9e --- /dev/null +++ b/regex/util/masked_move.h @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef MASKED_MOVE_H +#define MASKED_MOVE_H + +#include "arch.h" + +#if defined(HAVE_AVX2) + +#include "unaligned.h" +#include "simd_utils.h" + +#ifdef __cplusplus +extern "C" { +#endif +extern const u32 mm_mask_mask[16]; +extern const u32 mm_shuffle_end[32][8]; +#ifdef __cplusplus +} +#endif + +/* load mask for len bytes from start of buffer */ +static really_inline m256 +_get_mm_mask_end(u32 len) { + assert(len <= 32); + const u8 *masky = (const u8 *)mm_mask_mask; + m256 mask = load256(masky + 32); + mask = _mm256_sll_epi32(mask, _mm_cvtsi32_si128(8 - (len >> 2))); + return mask; +} + +/* + * masked_move256_len: Will load len bytes from *buf into m256 + * _______________________________ + * |0<----len---->| 32| + * ------------------------------- + */ +static really_inline m256 +masked_move256_len(const u8 *buf, const u32 len) { + assert(len >= 4); + + m256 lmask = _get_mm_mask_end(len); + + u32 end = unaligned_load_u32(buf + len - 4); + m256 preshufend = _mm256_broadcastq_epi64(_mm_cvtsi32_si128(end)); + m256 v = _mm256_maskload_epi32((const int *)buf, lmask); + m256 shufend = pshufb_m256(preshufend, + loadu256(&mm_shuffle_end[len - 4])); + m256 target = or256(v, shufend); + + return target; +} + +#endif /* AVX2 */ +#endif /* MASKED_MOVE_H */ + diff --git a/regex/util/multibit.c b/regex/util/multibit.c new file mode 100644 index 000000000..de192d7dd --- /dev/null +++ b/regex/util/multibit.c @@ -0,0 +1,140 @@ +/* + * Copyright (c) 2015-2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Multibit: lookup tables and support code. + * + * This C file contains the constant tables used by multibit, so we don't end + * up creating copies of them for every unit that uses it. 
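+ *
+ * As a worked example of how these tables are consumed by multibit.h: a
+ * 1000-bit multibit gives clz32(999) == 22, so mmbit_keyshift_lut[22] == 6
+ * and mmbit_maxlevel_direct_lut[22] == 1, i.e. a two-level tree (one summary
+ * block over up to 64 leaf blocks).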
+ */ + +#include "multibit.h" +#include "ue2common.h" + +const u8 mmbit_keyshift_lut[32] = { + 30, 30, 24, 24, 24, 24, 24, 24, 18, 18, 18, + 18, 18, 18, 12, 12, 12, 12, 12, 12, 6, 6, + 6, 6, 6, 6, 0, 0, 0, 0, 0, 0 +}; + +// The only actually valid values of ks are as shown in the LUT above, but a +// division is just too expensive. +const u8 mmbit_maxlevel_from_keyshift_lut[32] = { + 0, 0, 0, 0, 0, 0, + 1, 1, 1, 1, 1, 1, + 2, 2, 2, 2, 2, 2, + 3, 3, 3, 3, 3, 3, + 4, 4, 4, 4, 4, 4, + 5, 5 +}; + +const u8 mmbit_maxlevel_direct_lut[32] = { + 5, 5, 4, 4, 4, 4, 4, 4, 3, 3, 3, + 3, 3, 3, 2, 2, 2, 2, 2, 2, 1, 1, + 1, 1, 1, 1, 0, 0, 0, 0, 0, 0 +}; + +#define ZERO_TO_LUT(x) ((1ULL << x) - 1) + +const u64a mmbit_zero_to_lut[65] = { + ZERO_TO_LUT(0), + ZERO_TO_LUT(1), + ZERO_TO_LUT(2), + ZERO_TO_LUT(3), + ZERO_TO_LUT(4), + ZERO_TO_LUT(5), + ZERO_TO_LUT(6), + ZERO_TO_LUT(7), + ZERO_TO_LUT(8), + ZERO_TO_LUT(9), + ZERO_TO_LUT(10), + ZERO_TO_LUT(11), + ZERO_TO_LUT(12), + ZERO_TO_LUT(13), + ZERO_TO_LUT(14), + ZERO_TO_LUT(15), + ZERO_TO_LUT(16), + ZERO_TO_LUT(17), + ZERO_TO_LUT(18), + ZERO_TO_LUT(19), + ZERO_TO_LUT(20), + ZERO_TO_LUT(21), + ZERO_TO_LUT(22), + ZERO_TO_LUT(23), + ZERO_TO_LUT(24), + ZERO_TO_LUT(25), + ZERO_TO_LUT(26), + ZERO_TO_LUT(27), + ZERO_TO_LUT(28), + ZERO_TO_LUT(29), + ZERO_TO_LUT(30), + ZERO_TO_LUT(31), + ZERO_TO_LUT(32), + ZERO_TO_LUT(33), + ZERO_TO_LUT(34), + ZERO_TO_LUT(35), + ZERO_TO_LUT(36), + ZERO_TO_LUT(37), + ZERO_TO_LUT(38), + ZERO_TO_LUT(39), + ZERO_TO_LUT(40), + ZERO_TO_LUT(41), + ZERO_TO_LUT(42), + ZERO_TO_LUT(43), + ZERO_TO_LUT(44), + ZERO_TO_LUT(45), + ZERO_TO_LUT(46), + ZERO_TO_LUT(47), + ZERO_TO_LUT(48), + ZERO_TO_LUT(49), + ZERO_TO_LUT(50), + ZERO_TO_LUT(51), + ZERO_TO_LUT(52), + ZERO_TO_LUT(53), + ZERO_TO_LUT(54), + ZERO_TO_LUT(55), + ZERO_TO_LUT(56), + ZERO_TO_LUT(57), + ZERO_TO_LUT(58), + ZERO_TO_LUT(59), + ZERO_TO_LUT(60), + ZERO_TO_LUT(61), + ZERO_TO_LUT(62), + ZERO_TO_LUT(63), + ~0ULL +}; + +const u32 mmbit_root_offset_from_level[7] = { + 0, + 1, + 1 + (1 << MMB_KEY_SHIFT), + 1 + (1 << MMB_KEY_SHIFT) + (1 << MMB_KEY_SHIFT * 2), + 1 + (1 << MMB_KEY_SHIFT) + (1 << MMB_KEY_SHIFT * 2) + (1 << MMB_KEY_SHIFT * 3), + 1 + (1 << MMB_KEY_SHIFT) + (1 << MMB_KEY_SHIFT * 2) + (1 << MMB_KEY_SHIFT * 3) + (1 << MMB_KEY_SHIFT * 4), + 1 + (1 << MMB_KEY_SHIFT) + (1 << MMB_KEY_SHIFT * 2) + (1 << MMB_KEY_SHIFT * 3) + (1 << MMB_KEY_SHIFT * 4) + (1 << MMB_KEY_SHIFT * 5), +}; diff --git a/regex/util/multibit.h b/regex/util/multibit.h new file mode 100644 index 000000000..8697fb90a --- /dev/null +++ b/regex/util/multibit.h @@ -0,0 +1,1506 @@ +/* + * Copyright (c) 2015-2018, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Multibit: fast bitset structure, main runtime. + * + * *Structure* + * + * For sizes <= MMB_FLAT_MAX_BITS, a flat bit vector is used, stored as N + * 64-bit blocks followed by one "runt block". + * + * In larger cases, we use a sequence of blocks forming a tree. Each bit in an + * internal block indicates whether its child block contains valid data. Every + * level bar the last is complete. The last level is just a basic bit vector. + * + * ----------------------------------------------------------------------------- + * WARNING: + * + * mmbit code assumes that it is legal to load 8 bytes before the end of the + * mmbit. This means that for small mmbits (< 8byte), data may be read from + * before the base pointer. It is the user's responsibility to ensure that this + * is possible. + * ----------------------------------------------------------------------------- + */ +#ifndef MULTIBIT_H +#define MULTIBIT_H + +#include "config.h" +#include "ue2common.h" +#include "bitutils.h" +#include "partial_store.h" +#include "unaligned.h" +#include "multibit_internal.h" + +#ifndef __KERNEL__ +#include +#else +#include +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +#define MMB_ONE (1ULL) +#define MMB_ALL_ONES (0xffffffffffffffffULL) + +/** \brief Number of bits in a block. */ +#define MMB_KEY_BITS (sizeof(MMB_TYPE) * 8) + +#define MMB_KEY_MASK (MMB_KEY_BITS - 1) + +// Key structure defines +#define MMB_KEY_SHIFT 6 + +/** \brief Max size of a flat multibit. */ +#define MMB_FLAT_MAX_BITS 256 + +// Utility functions and data +// see multibit.c for contents +extern const u8 mmbit_keyshift_lut[32]; +extern const u8 mmbit_maxlevel_from_keyshift_lut[32]; +extern const u8 mmbit_maxlevel_direct_lut[32]; +extern const u32 mmbit_root_offset_from_level[7]; +extern const u64a mmbit_zero_to_lut[65]; + +static really_inline +MMB_TYPE mmb_load(const u8 * bits) { + return unaligned_load_u64a(bits); +} + +static really_inline +void mmb_store(u8 *bits, MMB_TYPE val) { + unaligned_store_u64a(bits, val); +} + +static really_inline +void mmb_store_partial(u8 *bits, MMB_TYPE val, u32 block_bits) { + assert(block_bits <= MMB_KEY_BITS); + partial_store_u64a(bits, val, ROUNDUP_N(block_bits, 8U) / 8U); +} + +static really_inline +MMB_TYPE mmb_single_bit(u32 bit) { + assert(bit < MMB_KEY_BITS); + return MMB_ONE << bit; +} + +static really_inline +MMB_TYPE mmb_mask_zero_to(u32 bit) { + assert(bit <= MMB_KEY_BITS); +#ifdef ARCH_32_BIT + return mmbit_zero_to_lut[bit]; +#else + if (bit == MMB_KEY_BITS) { + return MMB_ALL_ONES; + } else { + return mmb_single_bit(bit) - MMB_ONE; + } +#endif +} + +/** \brief Returns a mask of set bits up to position \a bit. Does not handle + * the case where bit == MMB_KEY_BITS. 
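+ * For example, mmb_mask_zero_to_nocheck(3) == 0x7 (bits 0..2 set).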
*/ +static really_inline +MMB_TYPE mmb_mask_zero_to_nocheck(u32 bit) { + assert(bit < MMB_KEY_BITS); +#ifdef ARCH_32_BIT + return mmbit_zero_to_lut[bit]; +#else + return mmb_single_bit(bit) - MMB_ONE; +#endif +} + +static really_inline +u32 mmb_test(MMB_TYPE val, u32 bit) { + assert(bit < MMB_KEY_BITS); + return (val >> bit) & MMB_ONE; +} + +static really_inline +void mmb_set(MMB_TYPE * val, u32 bit) { + assert(bit < MMB_KEY_BITS); + *val |= mmb_single_bit(bit); +} + +static really_inline +void mmb_clear(MMB_TYPE * val, u32 bit) { + assert(bit < MMB_KEY_BITS); + *val &= ~mmb_single_bit(bit); +} + +static really_inline +u32 mmb_ctz(MMB_TYPE val) { + return ctz64(val); +} + +static really_inline +u32 mmb_popcount(MMB_TYPE val) { + return popcount64(val); +} + +#ifndef MMMB_DEBUG +#define MDEBUG_PRINTF(x, ...) do { } while(0) +#else +#define MDEBUG_PRINTF DEBUG_PRINTF +#endif + +// Switch the following define on to trace writes to multibit. +//#define MMB_TRACE_WRITES +#ifdef MMB_TRACE_WRITES +#define MMB_TRACE(format, ...) \ + printf("mmb [%u bits @ %p] " format, total_bits, bits, ##__VA_ARGS__) +#else +#define MMB_TRACE(format, ...) \ + do { \ + } while (0) +#endif + +static really_inline +u32 mmbit_keyshift(u32 total_bits) { + assert(total_bits > 1); + u32 n = clz32(total_bits - 1); // subtract one as we're rounding down + return mmbit_keyshift_lut[n]; +} + +static really_inline +u32 mmbit_maxlevel(u32 total_bits) { + assert(total_bits > 1); + u32 n = clz32(total_bits - 1); // subtract one as we're rounding down + u32 max_level = mmbit_maxlevel_direct_lut[n]; + assert(max_level <= MMB_MAX_LEVEL); + return max_level; +} + +static really_inline +u32 mmbit_maxlevel_from_keyshift(u32 ks) { + assert(ks <= 30); + assert(ks % MMB_KEY_SHIFT == 0); + + u32 max_level = mmbit_maxlevel_from_keyshift_lut[ks]; + assert(max_level <= MMB_MAX_LEVEL); + return max_level; +} + +/** \brief get our keyshift for the current level */ +static really_inline +u32 mmbit_get_ks(u32 max_level, u32 level) { + assert(max_level <= MMB_MAX_LEVEL); + assert(level <= max_level); + return (max_level - level) * MMB_KEY_SHIFT; +} + +/** \brief get our key value for the current level */ +static really_inline +u32 mmbit_get_key_val(u32 max_level, u32 level, u32 key) { + return (key >> mmbit_get_ks(max_level, level)) & MMB_KEY_MASK; +} + +/** \brief get the level root for the current level */ +static really_inline +u8 *mmbit_get_level_root(u8 *bits, u32 level) { + assert(level < ARRAY_LENGTH(mmbit_root_offset_from_level)); + return bits + mmbit_root_offset_from_level[level] * sizeof(MMB_TYPE); +} + +/** \brief get the level root for the current level as const */ +static really_inline +const u8 *mmbit_get_level_root_const(const u8 *bits, u32 level) { + assert(level < ARRAY_LENGTH(mmbit_root_offset_from_level)); + return bits + mmbit_root_offset_from_level[level] * sizeof(MMB_TYPE); +} + +/** \brief get the block for this key on the current level as a u8 ptr */ +static really_inline +u8 *mmbit_get_block_ptr(u8 *bits, u32 max_level, u32 level, u32 key) { + u8 *level_root = mmbit_get_level_root(bits, level); + u32 ks = mmbit_get_ks(max_level, level); + return level_root + ((u64a)key >> (ks + MMB_KEY_SHIFT)) * sizeof(MMB_TYPE); +} + +/** \brief get the block for this key on the current level as a const u8 ptr */ +static really_inline +const u8 *mmbit_get_block_ptr_const(const u8 *bits, u32 max_level, u32 level, + u32 key) { + const u8 *level_root = mmbit_get_level_root_const(bits, level); + u32 ks = mmbit_get_ks(max_level, level); + 
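    /* each block at keyshift ks covers (MMB_KEY_BITS << ks) keys, so the block index is key >> (ks + MMB_KEY_SHIFT) */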
return level_root + ((u64a)key >> (ks + MMB_KEY_SHIFT)) * sizeof(MMB_TYPE); +} + +/** \brief get the _byte_ for this key on the current level as a u8 ptr */ +static really_inline +u8 *mmbit_get_byte_ptr(u8 *bits, u32 max_level, u32 level, u32 key) { + u8 *level_root = mmbit_get_level_root(bits, level); + u32 ks = mmbit_get_ks(max_level, level); + return level_root + ((u64a)key >> (ks + MMB_KEY_SHIFT - 3)); +} + +/** \brief get our key value for the current level */ +static really_inline +u32 mmbit_get_key_val_byte(u32 max_level, u32 level, u32 key) { + return (key >> (mmbit_get_ks(max_level, level))) & 0x7; +} + +/** \brief Load a flat bitvector block corresponding to N bits. */ +static really_inline +MMB_TYPE mmbit_get_flat_block(const u8 *bits, u32 n_bits) { + assert(n_bits <= MMB_KEY_BITS); + u32 n_bytes = ROUNDUP_N(n_bits, 8) / 8; + switch (n_bytes) { + case 1: + return *bits; + case 2: + return unaligned_load_u16(bits); + case 3: + case 4: { + u32 rv; + assert(n_bytes <= sizeof(rv)); + memcpy(&rv, bits + n_bytes - sizeof(rv), sizeof(rv)); + rv >>= (sizeof(rv) - n_bytes) * 8; /* need to shift to get things in + * the right position and remove + * junk */ + assert(rv == partial_load_u32(bits, n_bytes)); + return rv; + } + default: { + u64a rv; + assert(n_bytes <= sizeof(rv)); + memcpy(&rv, bits + n_bytes - sizeof(rv), sizeof(rv)); + rv >>= (sizeof(rv) - n_bytes) * 8; /* need to shift to get things in + * the right position and remove + * junk */ + assert(rv == partial_load_u64a(bits, n_bytes)); + return rv; + } + } +} + +/** \brief True if this multibit is small enough to use a flat model */ +static really_inline +u32 mmbit_is_flat_model(u32 total_bits) { + return total_bits <= MMB_FLAT_MAX_BITS; +} + +static really_inline +u32 mmbit_flat_size(u32 total_bits) { + assert(mmbit_is_flat_model(total_bits)); + return ROUNDUP_N(total_bits, 8) / 8; +} + +static really_inline +u32 mmbit_flat_select_byte(u32 key, UNUSED u32 total_bits) { + return key / 8; +} + +/** \brief returns the dense index of the bit in the given mask. */ +static really_inline +u32 mmbit_mask_index(u32 bit, MMB_TYPE mask) { + assert(bit < MMB_KEY_BITS); + assert(mmb_test(mask, bit)); + + mask &= mmb_mask_zero_to(bit); + if (mask == 0ULL) { + return 0; // Common case. + } + return mmb_popcount(mask); +} + +/** \brief Clear all bits. */ +static really_inline +void mmbit_clear(u8 *bits, u32 total_bits) { + MDEBUG_PRINTF("%p total_bits %u\n", bits, total_bits); + MMB_TRACE("CLEAR\n"); + if (!total_bits) { + return; + } + if (mmbit_is_flat_model(total_bits)) { + memset(bits, 0, mmbit_flat_size(total_bits)); + return; + } + mmb_store(bits, 0); +} + +/** \brief Specialisation of \ref mmbit_set for flat models. 
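+ * The byte is key / 8 and the mask is 1 << (key % 8): for example key 13
+ * lands in byte 1 with mask 0x20.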
*/ +static really_inline +char mmbit_set_flat(u8 *bits, u32 total_bits, u32 key) { + bits += mmbit_flat_select_byte(key, total_bits); + u8 mask = 1U << (key % 8); + char was_set = !!(*bits & mask); + *bits |= mask; + return was_set; +} + +static really_inline +char mmbit_set_big(u8 *bits, u32 total_bits, u32 key) { + const u32 max_level = mmbit_maxlevel(total_bits); + u32 level = 0; + do { + u8 * byte_ptr = mmbit_get_byte_ptr(bits, max_level, level, key); + u8 keymask = 1U << mmbit_get_key_val_byte(max_level, level, key); + u8 byte = *byte_ptr; + if (likely(!(byte & keymask))) { + *byte_ptr = byte | keymask; + while (level++ != max_level) { + u8 *block_ptr_1 = mmbit_get_block_ptr(bits, max_level, level, key); + MMB_TYPE keymask_1 = mmb_single_bit(mmbit_get_key_val(max_level, level, key)); + mmb_store(block_ptr_1, keymask_1); + } + return 0; + } + } while (level++ != max_level); + return 1; +} + +/** Internal version of \ref mmbit_set without MMB_TRACE, so it can be used by + * \ref mmbit_sparse_iter_dump. */ +static really_inline +char mmbit_set_i(u8 *bits, u32 total_bits, u32 key) { + assert(key < total_bits); + if (mmbit_is_flat_model(total_bits)) { + return mmbit_set_flat(bits, total_bits, key); + } else { + return mmbit_set_big(bits, total_bits, key); + } +} + +static really_inline +char mmbit_isset(const u8 *bits, u32 total_bits, u32 key); + +/** \brief Sets the given key in the multibit. Returns 0 if the key was NOT + * already set, 1 otherwise. */ +static really_inline +char mmbit_set(u8 *bits, u32 total_bits, u32 key) { + MDEBUG_PRINTF("%p total_bits %u key %u\n", bits, total_bits, key); + char status = mmbit_set_i(bits, total_bits, key); + MMB_TRACE("SET %u (prev status: %d)\n", key, (int)status); + assert(mmbit_isset(bits, total_bits, key)); + return status; +} + +/** \brief Specialisation of \ref mmbit_isset for flat models. */ +static really_inline +char mmbit_isset_flat(const u8 *bits, u32 total_bits, u32 key) { + bits += mmbit_flat_select_byte(key, total_bits); + return !!(*bits & (1U << (key % 8U))); +} + +static really_inline +char mmbit_isset_big(const u8 *bits, u32 total_bits, u32 key) { + const u32 max_level = mmbit_maxlevel(total_bits); + u32 level = 0; + do { + const u8 *block_ptr = mmbit_get_block_ptr_const(bits, max_level, level, key); + MMB_TYPE block = mmb_load(block_ptr); + if (!mmb_test(block, mmbit_get_key_val(max_level, level, key))) { + return 0; + } + } while (level++ != max_level); + return 1; +} + +/** \brief Returns whether the given key is set. */ +static really_inline +char mmbit_isset(const u8 *bits, u32 total_bits, u32 key) { + MDEBUG_PRINTF("%p total_bits %u key %u\n", bits, total_bits, key); + assert(key < total_bits); + if (mmbit_is_flat_model(total_bits)) { + return mmbit_isset_flat(bits, total_bits, key); + } else { + return mmbit_isset_big(bits, total_bits, key); + } +} + +/** \brief Specialisation of \ref mmbit_unset for flat models. */ +static really_inline +void mmbit_unset_flat(u8 *bits, u32 total_bits, u32 key) { + bits += mmbit_flat_select_byte(key, total_bits); + *bits &= ~(1U << (key % 8U)); +} + +// TODO: +// build two versions of this - unset_dangerous that doesn't clear the summary +// block and a regular unset that actually clears ALL the way up the levels if +// possible - might make a utility function for the clear +static really_inline +void mmbit_unset_big(u8 *bits, u32 total_bits, u32 key) { + /* This function is lazy as it does not clear the summary block + * entry if the child becomes empty. 
This is not a correctness problem as the + * summary block entries are used to mean that their children are valid + * rather than that they have a set child. */ + const u32 max_level = mmbit_maxlevel(total_bits); + u32 level = 0; + do { + u8 *block_ptr = mmbit_get_block_ptr(bits, max_level, level, key); + u32 key_val = mmbit_get_key_val(max_level, level, key); + MMB_TYPE block = mmb_load(block_ptr); + if (!mmb_test(block, key_val)) { + return; + } + if (level == max_level) { + mmb_clear(&block, key_val); + mmb_store(block_ptr, block); + } + } while (level++ != max_level); +} + +/** \brief Switch off a given key. */ +static really_inline +void mmbit_unset(u8 *bits, u32 total_bits, u32 key) { + MDEBUG_PRINTF("%p total_bits %u key %u\n", bits, total_bits, key); + assert(key < total_bits); + MMB_TRACE("UNSET %u (prev status: %d)\n", key, + (int)mmbit_isset(bits, total_bits, key)); + + if (mmbit_is_flat_model(total_bits)) { + mmbit_unset_flat(bits, total_bits, key); + } else { + mmbit_unset_big(bits, total_bits, key); + } +} + +/** \brief Specialisation of \ref mmbit_iterate for flat models. */ +static really_inline +u32 mmbit_iterate_flat(const u8 *bits, u32 total_bits, u32 it_in) { + // Short cut for single-block cases. + if (total_bits <= MMB_KEY_BITS) { + MMB_TYPE block = mmbit_get_flat_block(bits, total_bits); + if (it_in != MMB_INVALID) { + it_in++; + assert(it_in < total_bits); + block &= ~mmb_mask_zero_to(it_in); + } + if (block) { + return mmb_ctz(block); + } + return MMB_INVALID; + } + + const u32 last_block = total_bits / MMB_KEY_BITS; + u32 start; // starting block index + + if (it_in != MMB_INVALID) { + it_in++; + assert(it_in < total_bits); + + start = (ROUNDUP_N(it_in, MMB_KEY_BITS) / MMB_KEY_BITS) - 1; + u32 start_key = start * MMB_KEY_BITS; + u32 block_size = MIN(MMB_KEY_BITS, total_bits - start_key); + MMB_TYPE block = + mmbit_get_flat_block(bits + (start * sizeof(MMB_TYPE)), block_size); + block &= ~mmb_mask_zero_to(it_in - start_key); + + if (block) { + return start_key + mmb_ctz(block); + } else if (start_key + MMB_KEY_BITS >= total_bits) { + return MMB_INVALID; // That was the final block. + } + start++; + } else { + start = 0; + } + + // Remaining full-sized blocks. + for (; start < last_block; start++) { + MMB_TYPE block = mmb_load(bits + (start * sizeof(MMB_TYPE))); + if (block) { + return (start * MMB_KEY_BITS) + mmb_ctz(block); + } + } + + // We may have a final, smaller than full-sized, block to deal with at the + // end. + if (total_bits % MMB_KEY_BITS) { + u32 start_key = start * MMB_KEY_BITS; + u32 block_size = MIN(MMB_KEY_BITS, total_bits - start_key); + MMB_TYPE block = + mmbit_get_flat_block(bits + (start * sizeof(MMB_TYPE)), block_size); + if (block) { + return start_key + mmb_ctz(block); + } + } + + return MMB_INVALID; +} + +static really_inline +u32 mmbit_iterate_big(const u8 * bits, u32 total_bits, u32 it_in) { + const u32 max_level = mmbit_maxlevel(total_bits); + u32 level = 0; + u32 key = 0; + u32 key_rem = 0; + + if (it_in != MMB_INVALID) { + // We're continuing a previous iteration, so we need to go + // to max_level so we can pick up where we left off. 
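+        // The previous hit it_in lives in leaf block it_in >> MMB_KEY_SHIFT;
+        // key_rem below is the first candidate bit after it in that block.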
+ // NOTE: assumes that we're valid down the whole tree + key = it_in >> MMB_KEY_SHIFT; + key_rem = (it_in & MMB_KEY_MASK) + 1; + level = max_level; + } + while (1) { + if (key_rem < MMB_KEY_BITS) { + const u8 *block_ptr = mmbit_get_level_root_const(bits, level) + + key * sizeof(MMB_TYPE); + MMB_TYPE block + = mmb_load(block_ptr) & ~mmb_mask_zero_to_nocheck(key_rem); + if (block) { + key = (key << MMB_KEY_SHIFT) + mmb_ctz(block); + if (level++ == max_level) { + break; + } + key_rem = 0; + continue; // jump the rootwards step if we found a 'tree' non-zero bit + } + } + // rootwards step (block is zero or key_rem == MMB_KEY_BITS) + if (level-- == 0) { + return MMB_INVALID; // if we don't find anything and we're at the top level, we're done + } + key_rem = (key & MMB_KEY_MASK) + 1; + key >>= MMB_KEY_SHIFT; + } + assert(key < total_bits); + assert(mmbit_isset(bits, total_bits, key)); + return key; +} + +/** \brief Unbounded iterator. Returns the index of the next set bit after \a + * it_in, or MMB_INVALID. + * + * Note: assumes that if you pass in a value of it_in other than MMB_INVALID, + * that bit must be on (assumes all its summary blocks are set). + */ +static really_inline +u32 mmbit_iterate(const u8 *bits, u32 total_bits, u32 it_in) { + MDEBUG_PRINTF("%p total_bits %u it_in %u\n", bits, total_bits, it_in); + assert(it_in < total_bits || it_in == MMB_INVALID); + if (!total_bits) { + return MMB_INVALID; + } + if (it_in == total_bits - 1) { + return MMB_INVALID; // it_in is the last key. + } + + u32 key; + if (mmbit_is_flat_model(total_bits)) { + key = mmbit_iterate_flat(bits, total_bits, it_in); + } else { + key = mmbit_iterate_big(bits, total_bits, it_in); + } + assert(key == MMB_INVALID || mmbit_isset(bits, total_bits, key)); + return key; +} + +/** \brief Specialisation of \ref mmbit_any and \ref mmbit_any_precise for flat + * models. */ +static really_inline +char mmbit_any_flat(const u8 *bits, u32 total_bits) { + if (total_bits <= MMB_KEY_BITS) { + return !!mmbit_get_flat_block(bits, total_bits); + } + + const u8 *end = bits + mmbit_flat_size(total_bits); + for (const u8 *last = end - sizeof(MMB_TYPE); bits < last; + bits += sizeof(MMB_TYPE)) { + if (mmb_load(bits)) { + return 1; + } + } + + // Overlapping load at the end. + return !!mmb_load(end - sizeof(MMB_TYPE)); +} + +/** \brief True if any keys are (or might be) on in the given multibit. + * + * NOTE: mmbit_any is sloppy (may return true when only summary bits are set). + * Use \ref mmbit_any_precise if you need/want a correct answer. + */ +static really_inline +char mmbit_any(const u8 *bits, u32 total_bits) { + MDEBUG_PRINTF("%p total_bits %u\n", bits, total_bits); + if (!total_bits) { + return 0; + } + if (mmbit_is_flat_model(total_bits)) { + return mmbit_any_flat(bits, total_bits); + } + return !!mmb_load(bits); +} + +/** \brief True if there are any keys on. Guaranteed precise. 
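+ * In the tree model this walks down to the leaf blocks, so after a key is
+ * set and then unset (the lazy unset leaves summary bits on) this returns 0
+ * where \ref mmbit_any would still report 1.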
*/ +static really_inline +char mmbit_any_precise(const u8 *bits, u32 total_bits) { + MDEBUG_PRINTF("%p total_bits %u\n", bits, total_bits); + if (!total_bits) { + return 0; + } + if (mmbit_is_flat_model(total_bits)) { + return mmbit_any_flat(bits, total_bits); + } + + return mmbit_iterate_big(bits, total_bits, MMB_INVALID) != MMB_INVALID; +} + +static really_inline +char mmbit_all_flat(const u8 *bits, u32 total_bits) { + while (total_bits > MMB_KEY_BITS) { + if (mmb_load(bits) != MMB_ALL_ONES) { + return 0; + } + bits += sizeof(MMB_TYPE); + total_bits -= MMB_KEY_BITS; + } + while (total_bits > 8) { + if (*bits != 0xff) { + return 0; + } + bits++; + total_bits -= 8; + } + u8 mask = (u8)mmb_mask_zero_to_nocheck(total_bits); + return (*bits & mask) == mask; +} + +static really_inline +char mmbit_all_big(const u8 *bits, u32 total_bits) { + u32 ks = mmbit_keyshift(total_bits); + + u32 level = 0; + for (;;) { + // Number of bits we expect to see switched on on this level. + u32 level_bits; + if (ks != 0) { + u32 next_level_width = MMB_KEY_BITS << (ks - MMB_KEY_SHIFT); + level_bits = ROUNDUP_N(total_bits, next_level_width) >> ks; + } else { + level_bits = total_bits; + } + + const u8 *block_ptr = mmbit_get_level_root_const(bits, level); + + // All full-size blocks should be all-ones. + while (level_bits >= MMB_KEY_BITS) { + MMB_TYPE block = mmb_load(block_ptr); + if (block != MMB_ALL_ONES) { + return 0; + } + block_ptr += sizeof(MMB_TYPE); + level_bits -= MMB_KEY_BITS; + } + + // If we have bits remaining, we have a runt block on the end. + if (level_bits > 0) { + MMB_TYPE block = mmb_load(block_ptr); + MMB_TYPE mask = mmb_mask_zero_to_nocheck(level_bits); + if ((block & mask) != mask) { + return 0; + } + } + + if (ks == 0) { + break; + } + + ks -= MMB_KEY_SHIFT; + level++; + } + + return 1; +} + +/** \brief True if all keys are on. Guaranteed precise. */ +static really_inline +char mmbit_all(const u8 *bits, u32 total_bits) { + MDEBUG_PRINTF("%p total_bits %u\n", bits, total_bits); + + if (mmbit_is_flat_model(total_bits)) { + return mmbit_all_flat(bits, total_bits); + } + return mmbit_all_big(bits, total_bits); +} + +static really_inline +MMB_TYPE get_flat_masks(u32 base, u32 it_start, u32 it_end) { + if (it_end <= base) { + return 0; + } + u32 udiff = it_end - base; + MMB_TYPE mask = udiff < 64 ? mmb_mask_zero_to_nocheck(udiff) : MMB_ALL_ONES; + if (it_start >= base) { + u32 ldiff = it_start - base; + MMB_TYPE lmask = ldiff < 64 ? ~mmb_mask_zero_to_nocheck(ldiff) : 0; + mask &= lmask; + } + return mask; +} + +/** \brief Specialisation of \ref mmbit_iterate_bounded for flat models. */ +static really_inline +u32 mmbit_iterate_bounded_flat(const u8 *bits, u32 total_bits, u32 begin, + u32 end) { + // Short cut for single-block cases. + if (total_bits <= MMB_KEY_BITS) { + MMB_TYPE block = mmbit_get_flat_block(bits, total_bits); + block &= get_flat_masks(0, begin, end); + if (block) { + return mmb_ctz(block); + } + return MMB_INVALID; + } + + const u32 last_block = ROUNDDOWN_N(total_bits, MMB_KEY_BITS); + + // Iterate over full-sized blocks. + for (u32 i = ROUNDDOWN_N(begin, MMB_KEY_BITS), e = MIN(end, last_block); + i < e; i += MMB_KEY_BITS) { + const u8 *block_ptr = bits + i / 8; + MMB_TYPE block = mmb_load(block_ptr); + block &= get_flat_masks(i, begin, end); + if (block) { + return i + mmb_ctz(block); + } + } + + // Final block, which is less than full-sized. 
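+    // Only reached when the bounded range runs into the trailing partial
+    // block; mmbit_get_flat_block() loads just the num_bits live bits, so we
+    // never read past the end of the multibit.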
+ if (end > last_block) { + const u8 *block_ptr = bits + last_block / 8; + u32 num_bits = total_bits - last_block; + MMB_TYPE block = mmbit_get_flat_block(block_ptr, num_bits); + block &= get_flat_masks(last_block, begin, end); + if (block) { + return last_block + mmb_ctz(block); + } + } + + return MMB_INVALID; +} + +static really_inline +MMB_TYPE get_lowhi_masks(u32 level, u32 max_level, u64a block_min, u64a block_max, + u64a block_base) { + const u32 level_shift = (max_level - level) * MMB_KEY_SHIFT; + u64a lshift = (block_min - block_base) >> level_shift; + u64a ushift = (block_max - block_base) >> level_shift; + MMB_TYPE lmask = lshift < 64 ? ~mmb_mask_zero_to_nocheck(lshift) : 0; + MMB_TYPE umask = + ushift < 63 ? mmb_mask_zero_to_nocheck(ushift + 1) : MMB_ALL_ONES; + return lmask & umask; +} + +static really_inline +u32 mmbit_iterate_bounded_big(const u8 *bits, u32 total_bits, u32 it_start, u32 it_end) { + u64a key = 0; + u32 ks = mmbit_keyshift(total_bits); + const u32 max_level = mmbit_maxlevel_from_keyshift(ks); + u32 level = 0; + --it_end; // make end-limit inclusive + for (;;) { + assert(level <= max_level); + + u64a block_width = MMB_KEY_BITS << ks; + u64a block_base = key * block_width; + u64a block_min = MAX(it_start, block_base); + u64a block_max = MIN(it_end, block_base + block_width - 1); + const u8 *block_ptr = + mmbit_get_level_root_const(bits, level) + key * sizeof(MMB_TYPE); + MMB_TYPE block = mmb_load(block_ptr); + block &= get_lowhi_masks(level, max_level, block_min, block_max, block_base); + if (block) { + // Found a bit, go down a level + key = (key << MMB_KEY_SHIFT) + mmb_ctz(block); + if (level++ == max_level) { + return key; + } + ks -= MMB_KEY_SHIFT; + } else { + // No bit found, go up a level + // we know that this block didn't have any answers, so we can push + // our start iterator forward. + u64a next_start = block_base + block_width; + if (next_start > it_end) { + break; + } + if (level-- == 0) { + break; + } + it_start = next_start; + key >>= MMB_KEY_SHIFT; + ks += MMB_KEY_SHIFT; + } + } + return MMB_INVALID; +} + +/** \brief Bounded iterator. Returns the index of the first set bit between + * it_start (inclusive) and it_end (exclusive) or MMB_INVALID if no bits are + * set in that range. + */ +static really_inline +u32 mmbit_iterate_bounded(const u8 *bits, u32 total_bits, u32 it_start, + u32 it_end) { + MDEBUG_PRINTF("%p total_bits %u it_start %u it_end %u\n", bits, total_bits, + it_start, it_end); + assert(it_start <= it_end); + assert(it_end <= total_bits); + if (!total_bits || it_end == it_start) { + return MMB_INVALID; + } + assert(it_start < total_bits); + u32 key; + if (mmbit_is_flat_model(total_bits)) { + key = mmbit_iterate_bounded_flat(bits, total_bits, it_start, it_end); + } else { + key = mmbit_iterate_bounded_big(bits, total_bits, it_start, it_end); + } + assert(key == MMB_INVALID || mmbit_isset(bits, total_bits, key)); + return key; +} + +/** \brief Specialisation of \ref mmbit_unset_range for flat models. */ +static really_inline +void mmbit_unset_range_flat(u8 *bits, u32 total_bits, u32 begin, u32 end) { + const u32 last_block = ROUNDDOWN_N(total_bits, MMB_KEY_BITS); + + // Iterate over full-sized blocks. + for (u32 i = ROUNDDOWN_N(begin, MMB_KEY_BITS), e = MIN(end, last_block); + i < e; i += MMB_KEY_BITS) { + u8 *block_ptr = bits + i / 8; + MMB_TYPE block = mmb_load(block_ptr); + MMB_TYPE mask = get_flat_masks(i, begin, end); + mmb_store(block_ptr, block & ~mask); + } + + // Final block, which is less than full-sized. 
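+    // The runt block is read, masked and written back with
+    // mmb_store_partial(), which writes only the bytes that belong to this
+    // multibit.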
+ if (end > last_block) { + u8 *block_ptr = bits + last_block / 8; + u32 num_bits = total_bits - last_block; + MMB_TYPE block = mmbit_get_flat_block(block_ptr, num_bits); + MMB_TYPE mask = get_flat_masks(last_block, begin, end); + mmb_store_partial(block_ptr, block & ~mask, num_bits); + } +} + +static really_inline +void mmbit_unset_range_big(u8 *bits, const u32 total_bits, u32 begin, + u32 end) { + // TODO: combine iterator and unset operation; completely replace this + u32 i = begin; + for (;;) { + i = mmbit_iterate_bounded(bits, total_bits, i, end); + if (i == MMB_INVALID) { + break; + } + mmbit_unset_big(bits, total_bits, i); + if (++i == end) { + break; + } + } +} + +/** \brief Unset a whole range of bits. Ensures that all bits between \a begin + * (inclusive) and \a end (exclusive) are switched off. */ +static really_inline +void mmbit_unset_range(u8 *bits, const u32 total_bits, u32 begin, u32 end) { + MDEBUG_PRINTF("%p total_bits %u begin %u end %u\n", bits, total_bits, begin, + end); + assert(begin <= end); + assert(end <= total_bits); + if (mmbit_is_flat_model(total_bits)) { + mmbit_unset_range_flat(bits, total_bits, begin, end); + } else { + mmbit_unset_range_big(bits, total_bits, begin, end); + } + // No bits are on in [begin, end) once we're done. + assert(MMB_INVALID == mmbit_iterate_bounded(bits, total_bits, begin, end)); +} + +/** \brief Specialisation of \ref mmbit_init_range for flat models. */ +static really_inline +void mmbit_init_range_flat(u8 *bits, const u32 total_bits, u32 begin, u32 end) { + const u32 last_block = ROUNDDOWN_N(total_bits, MMB_KEY_BITS); + + // Iterate over full-sized blocks. + for (u32 i = 0; i < last_block; i += MMB_KEY_BITS) { + mmb_store(bits + i / 8, get_flat_masks(i, begin, end)); + } + + // Final block, which is less than full-sized. + if (total_bits % MMB_KEY_BITS) { + u32 num_bits = total_bits - last_block; + MMB_TYPE block = get_flat_masks(last_block, begin, end); + mmb_store_partial(bits + last_block / 8, block, num_bits); + } +} + +static really_inline +void mmbit_init_range_big(u8 *bits, const u32 total_bits, u32 begin, u32 end) { + u32 ks = mmbit_keyshift(total_bits); + u32 level = 0; + + for (;;) { + u8 *block = mmbit_get_level_root(bits, level); + u32 k1 = begin >> ks, k2 = end >> ks; + + // Summary blocks need to account for the runt block on the end. + if ((k2 << ks) != end) { + k2++; + } + + // Partial block to deal with beginning. + block += (k1 / MMB_KEY_BITS) * sizeof(MMB_TYPE); + if (k1 % MMB_KEY_BITS) { + u32 idx = k1 / MMB_KEY_BITS; + u32 block_end = (idx + 1) * MMB_KEY_BITS; + + // Because k1 % MMB_KEY_BITS != 0, we can avoid checking edge cases + // here (see the branch in mmb_mask_zero_to). + MMB_TYPE mask = MMB_ALL_ONES << (k1 % MMB_KEY_BITS); + + if (k2 < block_end) { + assert(k2 % MMB_KEY_BITS); + mask &= mmb_mask_zero_to_nocheck(k2 % MMB_KEY_BITS); + mmb_store(block, mask); + goto next_level; + } else { + mmb_store(block, mask); + k1 = block_end; + block += sizeof(MMB_TYPE); + } + } + + // Write blocks filled with ones until we get to the last block. + for (; k1 < (k2 & ~MMB_KEY_MASK); k1 += MMB_KEY_BITS) { + mmb_store(block, MMB_ALL_ONES); + block += sizeof(MMB_TYPE); + } + + // Final block. + if (likely(k1 < k2)) { + // Again, if k2 was at a block boundary, it would have been handled + // by the previous loop, so we know k2 % MMB_KEY_BITS != 0 and can + // avoid the branch in mmb_mask_zero_to here. 
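+            // For example, with 64-bit blocks, k1 == 64 and k2 == 70 reach
+            // this point with a mask covering just the low six bits of the
+            // final block.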
+ assert(k2 % MMB_KEY_BITS); + MMB_TYPE mask = mmb_mask_zero_to_nocheck(k2 % MMB_KEY_BITS); + mmb_store(block, mask); + } + + next_level: + if (ks == 0) { + break; // Last level is done, finished. + } + + ks -= MMB_KEY_SHIFT; + level++; + } +} + +/** \brief Initialises the multibit so that only the given range of bits are + * set. + * + * Ensures that all bits between \a begin (inclusive) and \a end (exclusive) + * are switched on. + */ +static really_inline +void mmbit_init_range(u8 *bits, const u32 total_bits, u32 begin, u32 end) { + MDEBUG_PRINTF("%p total_bits %u begin %u end %u\n", bits, total_bits, begin, + end); + assert(begin <= end); + assert(end <= total_bits); + + if (!total_bits) { + return; + } + + // Short cut for cases where we're not actually setting any bits; just + // clear the multibit. + if (begin == end) { + mmbit_clear(bits, total_bits); + return; + } + + if (mmbit_is_flat_model(total_bits)) { + mmbit_init_range_flat(bits, total_bits, begin, end); + } else { + mmbit_init_range_big(bits, total_bits, begin, end); + } + + assert(begin == end || + mmbit_iterate(bits, total_bits, MMB_INVALID) == begin); + assert(!end || begin == end || + mmbit_iterate(bits, total_bits, end - 1) == MMB_INVALID); +} + +/** \brief Determine the number of \ref mmbit_sparse_state elements required. + * */ +static really_inline +u32 mmbit_sparse_iter_state_size(u32 total_bits) { + if (mmbit_is_flat_model(total_bits)) { + return 2; + } + u32 levels = mmbit_maxlevel(total_bits); + return levels + 1; +} + +#ifdef DUMP_SUPPORT +// Dump function, defined in multibit.c. +void mmbit_sparse_iter_dump(const struct mmbit_sparse_iter *it, u32 total_bits); +#endif + +/** Internal: common loop used by mmbit_sparse_iter_{begin,next}_big. Returns + * matching next key given starting state, or MMB_INVALID. */ +static really_inline +u32 mmbit_sparse_iter_exec(const u8 *bits, u32 key, u32 *idx, u32 level, + const u32 max_level, struct mmbit_sparse_state *s, + const struct mmbit_sparse_iter *it_root, + const struct mmbit_sparse_iter *it) { + for (;;) { + MMB_TYPE block = s[level].mask; + if (block) { + u32 bit = mmb_ctz(block); + key = (key << MMB_KEY_SHIFT) + bit; + u32 bit_idx = mmbit_mask_index(bit, it->mask); + if (level++ == max_level) { + // we've found a key + *idx = it->val + bit_idx; + return key; + } else { + // iterator record is the start of the level (current it->val) + // plus N, where N is the dense index of the bit in the current + // level's itmask + u32 iter_key = it->val + bit_idx; + it = it_root + iter_key; + MMB_TYPE nextblock = + mmb_load(mmbit_get_level_root_const(bits, level) + + key * sizeof(MMB_TYPE)); + s[level].mask = nextblock & it->mask; + s[level].itkey = iter_key; + } + } else { + // No bits set in this block + if (level-- == 0) { + break; // no key available + } + key >>= MMB_KEY_SHIFT; + // Update state mask and iterator + s[level].mask &= (s[level].mask - 1); + it = it_root + s[level].itkey; + } + } + return MMB_INVALID; +} + +static really_inline +u32 mmbit_sparse_iter_begin_big(const u8 *bits, u32 total_bits, u32 *idx, + const struct mmbit_sparse_iter *it_root, + struct mmbit_sparse_state *s) { + const struct mmbit_sparse_iter *it = it_root; + u32 key = 0; + MMB_TYPE block = mmb_load(bits) & it->mask; + if (!block) { + return MMB_INVALID; + } + + // Load first block into top level state. 
+ const u32 max_level = mmbit_maxlevel(total_bits); + s[0].mask = block; + s[0].itkey = 0; + return mmbit_sparse_iter_exec(bits, key, idx, 0, max_level, + s, it_root, it); +} + +/** \brief Specialisation of \ref mmbit_sparse_iter_begin for flat models. */ +static really_inline +u32 mmbit_sparse_iter_begin_flat(const u8 *bits, u32 total_bits, u32 *idx, + const struct mmbit_sparse_iter *it_root, + struct mmbit_sparse_state *s) { + // Small cases have everything in the root iterator mask. + if (total_bits <= MMB_KEY_BITS) { + MMB_TYPE block = mmbit_get_flat_block(bits, total_bits); + block &= it_root->mask; + if (!block) { + return MMB_INVALID; + } + + s->mask = block; + u32 key = mmb_ctz(block); + *idx = mmbit_mask_index(key, it_root->mask); + return key; + } + + // Otherwise, the root iterator mask tells us which blocks (which we lay out + // linearly in the flat model) could contain keys. + assert(mmbit_maxlevel(total_bits) == 1); // Should only be two levels + MMB_TYPE root = it_root->mask; + for (; root; root &= (root - 1)) { + u32 bit = mmb_ctz(root); + u32 bit_idx = mmbit_mask_index(bit, it_root->mask); + u32 iter_key = it_root->val + bit_idx; + const struct mmbit_sparse_iter *it = it_root + iter_key; + u32 block_key_min = bit * MMB_KEY_BITS; + u32 block_key_max = block_key_min + MMB_KEY_BITS; + MMB_TYPE block; + if (block_key_max > total_bits) { + block_key_max = total_bits; + block = mmbit_get_flat_block(bits + (bit * sizeof(MMB_TYPE)), + block_key_max - block_key_min); + } else { + block = mmb_load(bits + (bit * sizeof(MMB_TYPE))); + } + + block &= it->mask; + if (block) { + s[0].mask = root; + s[1].mask = block; + s[1].itkey = iter_key; + u32 key = mmb_ctz(block); + *idx = it->val + mmbit_mask_index(key, it->mask); + return key + block_key_min; + } + } + + return MMB_INVALID; +} + +/** \brief Sparse iterator, find first key. + * + * Returns the first of the bits specified by the iterator \a it_root that is + * on, and initialises the state \a s. If none of the bits specified by the + * iterator are on, returns MMB_INVALID. + */ +static really_inline +u32 mmbit_sparse_iter_begin(const u8 *bits, u32 total_bits, u32 *idx, + const struct mmbit_sparse_iter *it_root, + struct mmbit_sparse_state *s) { + assert(ISALIGNED_N(it_root, alignof(struct mmbit_sparse_iter))); + + // Our state _may_ be on the stack +#ifndef _WIN32 + assert(ISALIGNED_N(s, alignof(struct mmbit_sparse_state))); +#else + assert(ISALIGNED_N(s, 4)); +#endif + + MDEBUG_PRINTF("%p total_bits %u\n", bits, total_bits); + // iterator should have _something_ at the root level + assert(it_root->mask != 0); + u32 key; + if (mmbit_is_flat_model(total_bits)) { + key = mmbit_sparse_iter_begin_flat(bits, total_bits, idx, it_root, s); + } else { + key = mmbit_sparse_iter_begin_big(bits, total_bits, idx, it_root, s); + } + if (key != MMB_INVALID) { + assert(key < total_bits); + assert(mmbit_isset(bits, total_bits, key)); + } + return key; +} + +static really_inline +u32 mmbit_sparse_iter_next_big(const u8 *bits, u32 total_bits, u32 last_key, + u32 *idx, + const struct mmbit_sparse_iter *it_root, + struct mmbit_sparse_state *s) { + const u32 max_level = mmbit_maxlevel(total_bits); + u32 key = last_key >> MMB_KEY_SHIFT; + s[max_level].mask &= (s[max_level].mask - 1); + const struct mmbit_sparse_iter *it = it_root + s[max_level].itkey; + return mmbit_sparse_iter_exec(bits, key, idx, max_level, max_level, s, + it_root, it); +} + +/** \brief Specialisation of \ref mmbit_sparse_iter_next for flat models. 
*/ +static really_inline +u32 mmbit_sparse_iter_next_flat(const u8 *bits, const u32 total_bits, u32 *idx, + const struct mmbit_sparse_iter *it_root, + struct mmbit_sparse_state *s) { + if (total_bits <= MMB_KEY_BITS) { + // All of our data is already in the s->mask, so we just need to scrape + // off the next match. + s->mask &= (s->mask - 1); + if (s->mask) { + u32 key = mmb_ctz(s->mask); + *idx = mmbit_mask_index(key, it_root->mask); + return key; + } + } else { + assert(s[0].mask); + + s[1].mask &= (s[1].mask - 1); // Remove previous key from iter state. + u32 bit = mmb_ctz(s[0].mask); // Flat block currently being accessed. + + for (;;) { + if (s[1].mask) { + u32 key = mmb_ctz(s[1].mask); + const struct mmbit_sparse_iter *it = it_root + s[1].itkey; + *idx = it->val + mmbit_mask_index(key, it->mask); + key += (bit * MMB_KEY_BITS); + return key; + } + + // Otherwise, we have no keys left in this block. Consult the root + // mask and find the next one. + + s[0].mask &= s[0].mask - 1; + if (!s[0].mask) { + break; + } + + bit = mmb_ctz(s[0].mask); + u32 bit_idx = mmbit_mask_index(bit, it_root->mask); + u32 iter_key = it_root->val + bit_idx; + const struct mmbit_sparse_iter *it = it_root + iter_key; + u32 block_key_min = bit * MMB_KEY_BITS; + u32 block_key_max = block_key_min + MMB_KEY_BITS; + MMB_TYPE block; + if (block_key_max > total_bits) { + block_key_max = total_bits; + block = mmbit_get_flat_block(bits + (bit * sizeof(MMB_TYPE)), + block_key_max - block_key_min); + } else { + block = mmb_load(bits + (bit * sizeof(MMB_TYPE))); + } + + s[1].mask = block & it->mask; + s[1].itkey = iter_key; + } + } + + return MMB_INVALID; +} + +/** \brief Sparse iterator, find next key. + * + * Takes in a sparse iterator tree structure \a it_root and a state array, and + * finds the next on bit (from the set of bits specified in the iterator). + * + * NOTE: The sparse iterator stores copies of the multibit blocks in its state, + * so it is not necessarily safe to set or unset bits in the multibit while + * iterating: the changes you make may or may not be taken into account + * by the iterator. + */ +static really_inline +u32 mmbit_sparse_iter_next(const u8 *bits, u32 total_bits, u32 last_key, + u32 *idx, const struct mmbit_sparse_iter *it_root, + struct mmbit_sparse_state *s) { + assert(ISALIGNED_N(it_root, alignof(struct mmbit_sparse_iter))); + + // Our state _may_ be on the stack +#ifndef _WIN32 + assert(ISALIGNED_N(s, alignof(struct mmbit_sparse_state))); +#else + assert(ISALIGNED_N(s, 4)); +#endif + + MDEBUG_PRINTF("%p total_bits %u\n", bits, total_bits); + MDEBUG_PRINTF("NEXT (total_bits=%u, last_key=%u)\n", total_bits, last_key); + UNUSED u32 last_idx = *idx; // for assertion at the end + // our iterator should have _something_ at the root level + assert(it_root->mask != 0); + assert(last_key < total_bits); + + u32 key; + if (mmbit_is_flat_model(total_bits)) { + key = mmbit_sparse_iter_next_flat(bits, total_bits, idx, it_root, s); + } else { + key = mmbit_sparse_iter_next_big(bits, total_bits, last_key, idx, + it_root, s); + } + if (key != MMB_INVALID) { + MDEBUG_PRINTF("END NEXT: key=%u, idx=%u\n", key, *idx); + assert(key < total_bits); + assert(key > last_key); + assert(mmbit_isset(bits, total_bits, key)); + assert(*idx > last_idx); + } else { + MDEBUG_PRINTF("END NEXT: no more keys\n"); + } + return key; +} + +/** \brief Specialisation of \ref mmbit_sparse_iter_unset for flat models. 
*/ +static really_inline +void mmbit_sparse_iter_unset_flat(u8 *bits, u32 total_bits, + const struct mmbit_sparse_iter *it_root) { + if (total_bits <= MMB_KEY_BITS) { + // Everything is in the root mask: we can just mask those bits off. + MMB_TYPE block = mmbit_get_flat_block(bits, total_bits); + block &= ~it_root->mask; + mmb_store_partial(bits, block, total_bits); + return; + } + + // Larger case, we have two iterator levels to worry about. + u32 bit_idx = 0; + for (MMB_TYPE root = it_root->mask; root; root &= (root - 1), bit_idx++) { + u32 bit = mmb_ctz(root); + u32 block_key_min = bit * MMB_KEY_BITS; + u32 block_key_max = block_key_min + MMB_KEY_BITS; + u8 *block_ptr = bits + (bit * sizeof(MMB_TYPE)); + u32 iter_key = it_root->val + bit_idx; + const struct mmbit_sparse_iter *it = it_root + iter_key; + if (block_key_max <= total_bits) { + // Full-sized block. + MMB_TYPE block = mmb_load(block_ptr); + block &= ~it->mask; + mmb_store(block_ptr, block); + } else { + // Runt (final) block. + u32 num_bits = total_bits - block_key_min; + MMB_TYPE block = mmbit_get_flat_block(block_ptr, num_bits); + block &= ~it->mask; + mmb_store_partial(block_ptr, block, num_bits); + break; // We know this is the last block. + } + } +} + +static really_inline +void mmbit_sparse_iter_unset_big(u8 *bits, u32 total_bits, + const struct mmbit_sparse_iter *it_root, + struct mmbit_sparse_state *s) { + const struct mmbit_sparse_iter *it = it_root; + MMB_TYPE block = mmb_load(bits) & it->mask; + if (!block) { + return; + } + + u32 key = 0; + const u32 max_level = mmbit_maxlevel(total_bits); + u32 level = 0; + + // Load first block into top level state + s[level].mask = block; + s[level].itkey = 0; + for (;;) { + block = s[level].mask; + if (block) { + if (level == max_level) { + // bottom level block: we want to mask out the bits specified + // by the iterator mask and then go back up a level. + u8 *block_ptr = + mmbit_get_level_root(bits, level) + key * sizeof(MMB_TYPE); + MMB_TYPE real_block = mmb_load(block_ptr); + real_block &= ~(it->mask); + mmb_store(block_ptr, real_block); + goto uplevel; // still cheap and nasty + } else { + u32 bit = mmb_ctz(block); + key = (key << MMB_KEY_SHIFT) + bit; + level++; + + // iterator record is the start of the level (current it->val) + // plus N, where N is the dense index of the bit in the current + // level's itmask + u32 iter_key = it->val + mmbit_mask_index(bit, it->mask); + it = it_root + iter_key; + MMB_TYPE nextblock = + mmb_load(mmbit_get_level_root_const(bits, level) + + key * sizeof(MMB_TYPE)); + s[level].mask = nextblock & it->mask; + s[level].itkey = iter_key; + } + } else { +uplevel: + // No bits set in this block + if (level == 0) { + return; // we are done + } + u8 *block_ptr = + mmbit_get_level_root(bits, level) + key * sizeof(MMB_TYPE); + MMB_TYPE real_block = mmb_load(block_ptr); + key >>= MMB_KEY_SHIFT; + level--; + + if (real_block == 0) { + // If we've zeroed our block For Real (unmasked by iterator), + // we can clear the parent bit that led us to it, so that + // we don't go down this particular garden path again later. + u32 bit = mmb_ctz(s[level].mask); + u8 *parent_ptr = + mmbit_get_level_root(bits, level) + key * sizeof(MMB_TYPE); + MMB_TYPE parent_block = mmb_load(parent_ptr); + mmb_clear(&parent_block, bit); + mmb_store(parent_ptr, parent_block); + } + + // Update state mask and iterator + s[level].mask &= (s[level].mask - 1); + it = it_root + s[level].itkey; + } + } +} + +/** \brief Sparse iterator, unset all bits. 
+ * + * Takes in a sparse iterator tree structure and switches off any entries found + * therein. + */ +static really_inline +void mmbit_sparse_iter_unset(u8 *bits, u32 total_bits, + const struct mmbit_sparse_iter *it, + struct mmbit_sparse_state *s) { + assert(ISALIGNED_N(it, alignof(struct mmbit_sparse_iter))); + + // Our state _may_ be on the stack +#ifndef _WIN32 + assert(ISALIGNED_N(s, alignof(struct mmbit_sparse_state))); +#else + assert(ISALIGNED_N(s, 4)); +#endif + + MDEBUG_PRINTF("%p total_bits %u\n", bits, total_bits); + +#ifdef MMB_TRACE_WRITES + MMB_TRACE("ITER-UNSET iter=["); + mmbit_sparse_iter_dump(it, total_bits); + printf("] actually on=["); + struct mmbit_sparse_state tmp[MAX_SPARSE_ITER_STATES]; + u32 idx = 0; + u32 i = mmbit_sparse_iter_begin(bits, total_bits, &idx, it, tmp); + for (; i != MMB_INVALID; + i = mmbit_sparse_iter_next(bits, total_bits, i, &idx, it, tmp)) { + printf(" %u", i); + } + printf("]\n"); +#endif + + if (mmbit_is_flat_model(total_bits)) { + mmbit_sparse_iter_unset_flat(bits, total_bits, it); + } else { + mmbit_sparse_iter_unset_big(bits, total_bits, it, s); + } +} + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // MULTIBIT_H diff --git a/regex/util/multibit_compress.h b/regex/util/multibit_compress.h new file mode 100644 index 000000000..e7b4fd8e8 --- /dev/null +++ b/regex/util/multibit_compress.h @@ -0,0 +1,204 @@ +/* + * Copyright (c) 2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** file + * \brief multibit compression API: compress / decompress / size + */ + +#ifndef MULTIBIT_COMPRESS_H +#define MULTIBIT_COMPRESS_H + +#include "multibit.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/** \brief size API. */ +static really_inline +size_t mmbit_compsize(const u8 *bits, u32 total_bits) { + // Deal with flat model. + if (total_bits <= MMB_FLAT_MAX_BITS) { + return (ROUNDUP_N(total_bits, 8) / 8); + } + // Deal with all cleared mmb. + if (mmb_load(bits) == 0) { + return sizeof(MMB_TYPE); + } + // Deal with normal pyramid mmb. 
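+    // Depth-first walk over the pyramid: every block reachable through set
+    // summary bits contributes sizeof(MMB_TYPE) bytes to the compressed form.
+    // The popcount comparison below counts a block only on its first visit,
+    // i.e. while key_rem is still zero.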
+ const u32 max_level = mmbit_maxlevel(total_bits); + u32 level = 0; + u32 key = 0; + u32 key_rem = 0; + u32 num_block = 0; + // Iteration-version of DFS + while (1) { + if (key_rem < MMB_KEY_BITS) { + const u8 *block_ptr = mmbit_get_level_root_const(bits, level) + + key * sizeof(MMB_TYPE); + MMB_TYPE block = mmb_load(block_ptr); + MMB_TYPE block_1 = block & ~mmb_mask_zero_to_nocheck(key_rem); + if (mmb_popcount(block) == mmb_popcount(block_1)) { + num_block++; + } + if (level < max_level && block_1) { + key = (key << MMB_KEY_SHIFT) + mmb_ctz(block_1); + key_rem = 0; + level++; + continue; + } + } + if (level-- == 0) { + return sizeof(MMB_TYPE) * num_block; + } + key_rem = (key & MMB_KEY_MASK) + 1; + key >>= MMB_KEY_SHIFT; + } +} + +/** \brief compress API. */ +static really_inline +char mmbit_compress(const u8 *bits, u32 total_bits, u8 *comp, + size_t *comp_space, size_t max_comp_space) { + UNUSED u8 *comp_init = comp; + // Compute comp_size first. + size_t comp_size = mmbit_compsize(bits, total_bits); + // Check whether out of writable range. + if (comp_size > max_comp_space) { + return 0; + } + *comp_space = comp_size; // Return comp_size outside. + // Deal with flat model. + if (total_bits <= MMB_FLAT_MAX_BITS) { + memcpy(comp, bits, comp_size); + return 1; + } + // Deal with all cleared mmb. + if (mmb_load(bits) == 0) { + memcpy(comp, bits, sizeof(MMB_TYPE)); + return 1; + } + // Deal with normal pyramid mmb. + const u32 max_level = mmbit_maxlevel(total_bits); + u32 level = 0; + u32 key = 0; + u32 key_rem = 0; + // Iteration-version of DFS + while (1) { + if (key_rem < MMB_KEY_BITS) { + const u8 *block_ptr = mmbit_get_level_root_const(bits, level) + + key * sizeof(MMB_TYPE); + MMB_TYPE block = mmb_load(block_ptr); + MMB_TYPE block_1 = block & ~mmb_mask_zero_to_nocheck(key_rem); + if (mmb_popcount(block) == mmb_popcount(block_1)) { + memcpy(comp, &block, sizeof(MMB_TYPE)); + comp += sizeof(MMB_TYPE); + } + if (level < max_level && block_1) { + key = (key << MMB_KEY_SHIFT) + mmb_ctz(block_1); + key_rem = 0; + level++; + continue; + } + } + if (level-- == 0) { + break; + } + key_rem = (key & MMB_KEY_MASK) + 1; + key >>= MMB_KEY_SHIFT; + } + assert((u32)(comp - comp_init) == comp_size); + return 1; +} + +/** \brief decompress API. */ +static really_inline +char mmbit_decompress(u8 *bits, u32 total_bits, const u8 *comp, + size_t *comp_space, size_t max_comp_space) { + UNUSED const u8 *comp_init = comp; + size_t comp_size; + // Deal with flat model. + if (total_bits <= MMB_FLAT_MAX_BITS) { + comp_size = ROUNDUP_N(total_bits, 8) / 8; + memcpy(bits, comp, comp_size); + *comp_space = comp_size; + return 1; + } + // Deal with all cleared mmb. + if (mmb_load(comp) == 0) { + comp_size = sizeof(MMB_TYPE); + memcpy(bits, comp, comp_size); + *comp_space = comp_size; + return 1; + } + // Deal with normal mmb. + u32 max_level = mmbit_maxlevel(total_bits); + u32 level = 0; + u32 key = 0; + u32 key_rem = 0; + UNUSED const u8 *comp_end = comp_init + max_comp_space; + // Iteration-version of DFS + memcpy(bits, comp, sizeof(MMB_TYPE)); // Copy root block first. 
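+    // Replay the compressor's depth-first walk: each set summary bit we
+    // descend through names a child block whose contents are the next
+    // sizeof(MMB_TYPE) bytes of the compressed stream.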
+ comp += sizeof(MMB_TYPE); + while (1) { + if (key_rem < MMB_KEY_BITS) { + u8 *block_ptr = mmbit_get_level_root(bits, level) + + key * sizeof(MMB_TYPE); + MMB_TYPE block = mmb_load(block_ptr); + MMB_TYPE block_1 = block & ~mmb_mask_zero_to_nocheck(key_rem); + if (level < max_level && block_1) { + key = (key << MMB_KEY_SHIFT) + mmb_ctz(block_1); + u8 *block_ptr_1 = mmbit_get_level_root(bits, level + 1) + + key * sizeof(MMB_TYPE); + memcpy(block_ptr_1, comp, sizeof(MMB_TYPE)); + comp += sizeof(MMB_TYPE); + if (comp > comp_end) { + return 0; // Out of buffer. + } + key_rem = 0; + level++; + continue; + } + } + if (level-- == 0) { + break; + } + key_rem = (key & MMB_KEY_MASK) + 1; + key >>= MMB_KEY_SHIFT; + } + comp_size = (u32)(comp - comp_init); + *comp_space = comp_size; + return 1; +} + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // MULTBIT_COMPRESS_H + diff --git a/regex/util/multibit_internal.h b/regex/util/multibit_internal.h new file mode 100644 index 000000000..350f3bfd4 --- /dev/null +++ b/regex/util/multibit_internal.h @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2015-2016, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Multibit: data structures. + * + * If all you need is the sizes of multibit's few structures, then including + * this file is a much better idea than including all of multibit.h. + */ +#ifndef MULTIBIT_INTERNAL_H +#define MULTIBIT_INTERNAL_H + +#include "ue2common.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/** \brief Sentinel value meaning "no key found". */ +#define MMB_INVALID 0xffffffffu + +typedef u64a MMB_TYPE; /**< Basic block type for mmbit operations. */ +#define MMB_MAX_LEVEL 6 /**< Maximum level in the mmbit pyramid. */ + +/** \brief Maximum number of keys (bits) in a multibit. */ +#define MMB_MAX_BITS (1U << 31) + +/** \brief Sparse iterator record type. + * + * A sparse iterator is a tree of these records, where val identifies the + * offset of the result for leaf nodes and points to the next record for + * intermediate nodes. 
Built by the code in multibit_build.cpp. + */ +struct mmbit_sparse_iter { + MMB_TYPE mask; + u32 val; +}; + +/** \brief Sparse iterator runtime state type. + * + * An array of these records (one per "level" in the multibit pyramid) is used + * to store the current iteration state. + */ +struct mmbit_sparse_state { + MMB_TYPE mask; //!< \brief masked last block read at this level. + u32 itkey; //!< \brief iterator offset for this level. +}; + +/** \brief Maximum number of \ref mmbit_sparse_state that could be needed. */ +#define MAX_SPARSE_ITER_STATES (6 + 1) + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // MULTIBIT_INTERNAL_H diff --git a/regex/util/pack_bits.h b/regex/util/pack_bits.h new file mode 100644 index 000000000..800ce25ec --- /dev/null +++ b/regex/util/pack_bits.h @@ -0,0 +1,227 @@ +/* + * Copyright (c) 2015, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Functions for packing/unpacking arrays. + */ + +#ifndef UTIL_PACK_BITS_H +#define UTIL_PACK_BITS_H + +#include "ue2common.h" +#include "unaligned.h" +#include "partial_store.h" + +/** + * \brief Pack bits from an array of 32-bit words into \a out. + * + * \param out Output array. Must be large enough to store sum(bits). + * \param v Input array. + * \param bits Number of low bits in the corresponding element of \a v to pack. + * \param elements Size of the \a v and \a bits arrays. + */ +static really_inline +void pack_bits_32(char *out, const u32 *v, const u32 *bits, + const unsigned int elements); + +/** + * \brief Pack bits from an array of 64-bit words into \a out. + * + * \param out Output array. Must be large enough to store sum(bits). + * \param v Input array. + * \param bits Number of low bits in the corresponding element of \a v to pack. + * \param elements Size of the \a v and \a bits arrays. + */ +static really_inline +void pack_bits_64(char *out, const u64a *v, const u32 *bits, + const unsigned int elements); + +/** + * \brief Unpack bits into an array of 32-bit words according to the counts + * given. 
+ * + * \param v Output array. + * \param in Packed input array. + * \param bits Number of bits to unpack into the corresponding element of \a v. + * \param elements Size of the \a v and \a bits arrays. + */ +static really_inline +void unpack_bits_32(u32 *v, const u8 *in, const u32 *bits, + const unsigned int elements); + +/** + * \brief Unpack bits into an array of 64-bit words according to the counts + * given. + * + * \param v Output array. + * \param in Packed input array. + * \param bits Number of bits to unpack into the corresponding element of \a v. + * \param elements Size of the \a v and \a bits arrays. + */ +static really_inline +void unpack_bits_64(u64a *v, const u8 *in, const u32 *bits, + const unsigned int elements); + +/* + * Inline implementations follow. + */ + +static really_inline +void pack_bits_32(char *out, const u32 *v, const u32 *bits, + const unsigned int elements) { + u32 write = 0; // accumulator + u32 idx = 0; // acc holds this many bits + + for (unsigned int i = 0; i < elements; i++) { + assert(bits[i] <= 32); + write |= (v[i] << idx); + idx += bits[i]; + if (idx >= 32) { + unaligned_store_u32(out, write); + out += 4; + idx -= 32; + u32 leftover = bits[i] - idx; + if (leftover == 32) { + write = 0; + } else { + assert(leftover < 32); + write = v[i] >> leftover; + } + } + } + + // There might be a write left over. + partial_store_u32(out, write, (idx + 7) / 8); +} + +static really_inline +void pack_bits_64(char *out, const u64a *v, const u32 *bits, + const unsigned int elements) { + u64a write = 0; // accumulator + u32 idx = 0; // acc holds this many bits + + for (unsigned int i = 0; i < elements; i++) { + assert(bits[i] <= 64); + write |= (v[i] << idx); + idx += bits[i]; + if (idx >= 64) { + unaligned_store_u64a(out, write); + out += 8; + idx -= 64; + u32 leftover = bits[i] - idx; + if (leftover == 64) { + write = 0; + } else { + assert(leftover < 64); + write = v[i] >> leftover; + } + } + } + + // There might be a write left over. 
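+    // idx bits are still buffered in the accumulator; (idx + 7) / 8 rounds
+    // them up to whole bytes for the final partial store.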
+ DEBUG_PRINTF("partial store of idx=%u\n", idx); + partial_store_u64a(out, write, (idx + 7) / 8); +} + +static really_inline +void unpack_bits_32(u32 *v, const u8 *in, const u32 *bits, + const unsigned int elements) { + u32 used = 0; // bits used from *in + + for (unsigned int i = 0; i < elements; i++) { + assert(bits[i] <= 32); + u32 v_out = 0; // accumulator for v[i] + u32 b = bits[i]; // bits left to read for v[i] + u32 vidx = 0; // bits written to v[i] + + while (b) { + u32 read = *in >> used; + u32 bits_read = 8 - used; + + if (b <= bits_read) { + u32 mask = read & ((1U << b) - 1); + v_out |= mask << vidx; + vidx += b; + used += b; + b = 0; + if (used < 8) { + continue; // more from this *in + } + } else { + v_out |= read << vidx; + vidx += bits_read; + b -= bits_read; + } + + used = 0; + in++; + } + + v[i] = v_out; + } +} + +static really_inline +void unpack_bits_64(u64a *v, const u8 *in, const u32 *bits, + const unsigned int elements) { + u32 used = 0; // bits used from *in + + for (unsigned int i = 0; i < elements; i++) { + assert(bits[i] <= 64); + u64a v_out = 0; // accumulator for v[i] + u32 b = bits[i]; // bits left to read for v[i] + u32 vidx = 0; // bits written to v[i] + + while (b) { + u64a read = *in >> used; + u32 bits_read = 8 - used; + + if (b <= bits_read) { + u64a mask = read & ((1U << b) - 1); + v_out |= mask << vidx; + vidx += b; + used += b; + b = 0; + if (used < 8) { + continue; // more from this *in + } + } else { + v_out |= read << vidx; + vidx += bits_read; + b -= bits_read; + } + + used = 0; + in++; + } + + v[i] = v_out; + } +} + +#endif // UTIL_PACK_BITS_H diff --git a/regex/util/partial_store.h b/regex/util/partial_store.h new file mode 100644 index 000000000..a49d1fae1 --- /dev/null +++ b/regex/util/partial_store.h @@ -0,0 +1,163 @@ +/* + * Copyright (c) 2015, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef PARTIAL_STORE_H +#define PARTIAL_STORE_H + +#include "ue2common.h" +#include "unaligned.h" + +/* loads/stores the least significant bytes of the values. 
*/ + +static really_inline +void partial_store_u32(void *ptr, u32 value, u32 numBytes) { + assert(numBytes <= 4); + switch (numBytes) { + case 4: + unaligned_store_u32(ptr, value); + break; + case 3: + unaligned_store_u16(ptr, (u16)value); + *((u8 *)ptr + 2) = (u8)(value >> 16); + break; + case 2: + unaligned_store_u16(ptr, (u16)value); + break; + case 1: + *(u8 *)ptr = (u8)value; + break; + case 0: + break; + } +} + +static really_inline +u32 partial_load_u32(const void *ptr, u32 numBytes) { + u32 value; + assert(numBytes <= 4); + switch (numBytes) { + case 4: + value = unaligned_load_u32(ptr); + return value; + case 3: + value = unaligned_load_u16(ptr); + value |= ((u32)(*((const u8 *)ptr + 2)) << 16); + return value; + case 2: + value = unaligned_load_u16(ptr); + return value; + case 1: + value = *(const u8 *)ptr; + return value; + case 0: + break; + } + + return 0; +} + +static really_inline +void partial_store_u64a(void *ptr, u64a value, u32 numBytes) { + assert(numBytes <= 8); + switch (numBytes) { + case 8: + unaligned_store_u64a(ptr, value); + break; + case 7: + unaligned_store_u32(ptr, (u32)value); + unaligned_store_u16((u8 *)ptr + 4, (u16)(value >> 32)); + *((u8 *)ptr + 6) = (u8)(value >> 48); + break; + case 6: + unaligned_store_u32(ptr, (u32)value); + unaligned_store_u16((u8 *)ptr + 4, (u16)(value >> 32)); + break; + case 5: + unaligned_store_u32(ptr, (u32)value); + *((u8 *)ptr + 4) = (u8)(value >> 32); + break; + case 4: + unaligned_store_u32(ptr, (u32)value); + break; + case 3: + unaligned_store_u16(ptr, (u16)value); + *((u8 *)ptr + 2) = (u8)(value >> 16); + break; + case 2: + unaligned_store_u16(ptr, (u16)value); + break; + case 1: + *(u8 *)ptr = (u8)value; + break; + case 0: + break; + } +} + +static really_inline +u64a partial_load_u64a(const void *ptr, u32 numBytes) { + u64a value; + assert(numBytes <= 8); + switch (numBytes) { + case 8: + value = unaligned_load_u64a(ptr); + return value; + case 7: + value = unaligned_load_u32(ptr); + value |= (u64a)unaligned_load_u16((const u8 *)ptr + 4) << 32; + value |= (u64a)(*((const u8 *)ptr + 6)) << 48; + return value; + case 6: + value = unaligned_load_u32(ptr); + value |= (u64a)unaligned_load_u16((const u8 *)ptr + 4) << 32; + return value; + case 5: + value = unaligned_load_u32(ptr); + value |= (u64a)(*((const u8 *)ptr + 4)) << 32; + return value; + case 4: + value = unaligned_load_u32(ptr); + return value; + case 3: + value = unaligned_load_u16(ptr); + value |= (u64a)(*((const u8 *)ptr + 2)) << 16; + return value; + case 2: + value = unaligned_load_u16(ptr); + return value; + case 1: + value = *(const u8 *)ptr; + return value; + case 0: + break; + } + + return 0; +} + +#endif diff --git a/regex/util/popcount.h b/regex/util/popcount.h new file mode 100644 index 000000000..eb08f6b1b --- /dev/null +++ b/regex/util/popcount.h @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Platform specific popcount functions + */ + +#ifndef UTIL_POPCOUNT_H_ +#define UTIL_POPCOUNT_H_ + +#include "ue2common.h" +#include "util/arch.h" + +static really_inline +u32 popcount32(u32 x) { +#if defined(HAVE_POPCOUNT_INSTR) + // Single-instruction builtin. + return _mm_popcnt_u32(x); +#else + // Fast branch-free version from bit-twiddling hacks as older Intel + // processors do not have a POPCNT instruction. + x -= (x >> 1) & 0x55555555; + x = (x & 0x33333333) + ((x >> 2) & 0x33333333); + return (((x + (x >> 4)) & 0xf0f0f0f) * 0x1010101) >> 24; +#endif +} + +static really_inline +u32 popcount64(u64a x) { +#if defined(ARCH_X86_64) +# if defined(HAVE_POPCOUNT_INSTR) + // Single-instruction builtin. + return (u32)_mm_popcnt_u64(x); +# else + // Fast branch-free version from bit-twiddling hacks as older Intel + // processors do not have a POPCNT instruction. + x -= (x >> 1) & 0x5555555555555555; + x = (x & 0x3333333333333333) + ((x >> 2) & 0x3333333333333333); + x = (x + (x >> 4)) & 0x0f0f0f0f0f0f0f0f; + return (x * 0x0101010101010101) >> 56; +# endif +#else + // Synthesise from two 32-bit cases. + return popcount32(x >> 32) + popcount32(x); +#endif +} + +#endif /* UTIL_POPCOUNT_H_ */ + diff --git a/regex/util/pqueue.h b/regex/util/pqueue.h new file mode 100644 index 000000000..f0ba12e70 --- /dev/null +++ b/regex/util/pqueue.h @@ -0,0 +1,109 @@ +/* + * Copyright (c) 2015, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef PQUEUE_H +#define PQUEUE_H + +#include "ue2common.h" + +static really_inline u32 +pq_left(u32 i) { + return (i << 1) + 1; +} + +static really_inline u32 +pq_right(u32 i) { + return (i << 1) + 2; +} + +static really_inline +u32 pq_parent(u32 i) { + return (i - 1) >> 1; +} + +static really_inline +void pq_sift(PQ_T *items, u32 start, u32 end) { + u32 j = start; + PQ_T j_temp = items[j]; + + while (pq_left(j) < end) { + u32 max_child; + + if (pq_right(j) < end && PQ_COMP(items, pq_right(j), pq_left(j))) { + max_child = pq_right(j); + } else { + max_child = pq_left(j); + } + + if (PQ_COMP_B(items, max_child, j_temp)) { + items[j] = items[max_child]; + j = max_child; + } else { + /* j is already less than its children. We know heap property + * is already maintained for children we are done */ + break; + } + } + items[j] = j_temp; +} + +static really_inline +PQ_T *pq_top(PQ_T *items) { + return items; +} + +static really_inline +void pq_pop(PQ_T *items, u32 item_count) { + item_count--; + items[0] = items[item_count]; + pq_sift(items, 0, item_count); +} + +static really_inline +void pq_insert(PQ_T *items, u32 item_count, PQ_T new_item) { + u32 pos = item_count; + while (pos) { + u32 parent = pq_parent(pos); + if (!PQ_COMP_B(items, parent, new_item)) { + items[pos] = items[parent]; + pos = parent; + } else { + break; + } + } + items[pos] = new_item; +} + +static really_inline +void pq_replace_top(PQ_T *items, u32 item_count, PQ_T new_item) { + items[0] = new_item; + pq_sift(items, 0, item_count); +} + +#endif + diff --git a/regex/util/scatter.h b/regex/util/scatter.h new file mode 100644 index 000000000..40a1ab248 --- /dev/null +++ b/regex/util/scatter.h @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2015, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef UTIL_SCATTER_H +#define UTIL_SCATTER_H + +#include "ue2common.h" + +#define SCATTER_STRUCT(t) \ + struct scatter_unit_##t { u32 offset; t val; }; + +SCATTER_STRUCT(u64a) +SCATTER_STRUCT(u32) +SCATTER_STRUCT(u16) +SCATTER_STRUCT(u8) + +struct scatter_full_plan { + u32 s_u64a_offset; + u32 s_u64a_count; + u32 s_u32_offset; + u32 s_u32_count; + u32 s_u16_offset; + u32 s_u16_count; + u32 s_u8_count; + u32 s_u8_offset; +}; + +#undef SCATTER_STRUCT + +#endif diff --git a/regex/util/scatter_runtime.h b/regex/util/scatter_runtime.h new file mode 100644 index 000000000..09bc742d9 --- /dev/null +++ b/regex/util/scatter_runtime.h @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2015, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef UTIL_SCATTER_RUNTIME_H +#define UTIL_SCATTER_RUNTIME_H + +#include "scatter.h" + +#include "uniform_ops.h" + +#define SCATTER_DEF(t) \ +static really_inline \ +void scatter_##t(void *out, const struct scatter_unit_##t *plan, u32 count) { \ + for (u32 i = 0; i < count; i++) { \ + const struct scatter_unit_##t *item = plan + i; \ + DEBUG_PRINTF("storing %llu into offset %u\n", (u64a)item->val, \ + item->offset); \ + storeu_##t((char *)out + item->offset, item->val); \ + } \ +} + +SCATTER_DEF(u64a) +SCATTER_DEF(u32) +SCATTER_DEF(u16) +SCATTER_DEF(u8) + +#undef SCATTER_DEF + +static really_inline +void scatter(void *out, const void *base, const struct scatter_full_plan *p) { +#define RUN_SUB(t) \ + if (p->s_##t##_offset) { \ + assert(p->s_##t##_count); \ + const struct scatter_unit_##t *pp \ + = (const void *)(b + p->s_##t##_offset); \ + scatter_##t(out, pp, p->s_##t##_count); \ + } + + const char *b = base; + + RUN_SUB(u64a); + RUN_SUB(u32); + RUN_SUB(u16); + RUN_SUB(u8); + +#undef RUN_SUB +} + +#endif diff --git a/regex/util/simd_types.h b/regex/util/simd_types.h new file mode 100644 index 000000000..962cad6c9 --- /dev/null +++ b/regex/util/simd_types.h @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef SIMD_TYPES_H +#define SIMD_TYPES_H + +#include "config.h" +#include "util/arch.h" +#include "util/intrinsics.h" +#include "ue2common.h" + +#if defined(HAVE_SSE2) +typedef __m128i m128; +#else +typedef struct ALIGN_DIRECTIVE {u64a hi; u64a lo;} m128; +#endif + +#if defined(HAVE_AVX2) +typedef __m256i m256; +#else +typedef struct ALIGN_AVX_DIRECTIVE {m128 lo; m128 hi;} m256; +#endif + +typedef struct {m128 lo; m128 mid; m128 hi;} m384; +#if defined(HAVE_AVX512) +typedef __m512i m512; +#else +typedef struct ALIGN_ATTR(64) {m256 lo; m256 hi;} m512; +#endif + +#endif /* SIMD_TYPES_H */ + diff --git a/regex/util/simd_utils.c b/regex/util/simd_utils.c new file mode 100644 index 000000000..25a81412e --- /dev/null +++ b/regex/util/simd_utils.c @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2016-2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Lookup tables to support SIMD operations. + */ + +#include "simd_utils.h" + +ALIGN_CL_DIRECTIVE const char vbs_mask_data[] = { + 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, + 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, + + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, + + 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, + 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, +}; + +#define ZEROES_8 0, 0, 0, 0, 0, 0, 0, 0 +#define ZEROES_31 ZEROES_8, ZEROES_8, ZEROES_8, 0, 0, 0, 0, 0, 0, 0 +#define ZEROES_32 ZEROES_8, ZEROES_8, ZEROES_8, ZEROES_8 + +/** \brief LUT for the mask1bit functions. 
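+ *
+ * Layout: every bit value 0x01..0x80 is stored as a single non-zero byte
+ * padded with enough zero bytes that an unaligned load of any supported
+ * vector width (128/256/512 bits) sees exactly one set bit. For example,
+ * mask1bit128(9) loads 16 bytes whose only non-zero byte is byte 1 holding
+ * 0x02, i.e. a vector with just bit 9 set.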
*/ +ALIGN_CL_DIRECTIVE const u8 simd_onebit_masks[] = { + ZEROES_32, ZEROES_32, + ZEROES_31, 0x01, ZEROES_32, + ZEROES_31, 0x02, ZEROES_32, + ZEROES_31, 0x04, ZEROES_32, + ZEROES_31, 0x08, ZEROES_32, + ZEROES_31, 0x10, ZEROES_32, + ZEROES_31, 0x20, ZEROES_32, + ZEROES_31, 0x40, ZEROES_32, + ZEROES_31, 0x80, ZEROES_32, + ZEROES_32, ZEROES_32, +}; diff --git a/regex/util/simd_utils.h b/regex/util/simd_utils.h new file mode 100644 index 000000000..d828f591b --- /dev/null +++ b/regex/util/simd_utils.h @@ -0,0 +1,1424 @@ +/* + * Copyright (c) 2015-2020, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief SIMD types and primitive operations. + */ + +#ifndef SIMD_UTILS +#define SIMD_UTILS + +#if !defined(_WIN32) && !defined(__SSSE3__) +#error SSSE3 instructions must be enabled +#endif + +#include "config.h" +#include "ue2common.h" +#include "simd_types.h" +#include "unaligned.h" +#include "util/arch.h" +#include "util/intrinsics.h" + +#ifndef __KERNEL__ +#include // for memcpy +#else +#include +#endif + +// Define a common assume_aligned using an appropriate compiler built-in, if +// it's available. Note that we need to handle C or C++ compilation. +#ifdef __cplusplus +# ifdef HAVE_CXX_BUILTIN_ASSUME_ALIGNED +# define assume_aligned(x, y) __builtin_assume_aligned((x), (y)) +# endif +#else +# ifdef HAVE_CC_BUILTIN_ASSUME_ALIGNED +# define assume_aligned(x, y) __builtin_assume_aligned((x), (y)) +# endif +#endif + +// Fallback to identity case. +#ifndef assume_aligned +#define assume_aligned(x, y) (x) +#endif + +#ifdef __cplusplus +extern "C" { +#endif +extern const char vbs_mask_data[]; +#ifdef __cplusplus +} +#endif + +static really_inline m128 ones128(void) { +#if defined(__GNUC__) || defined(__INTEL_COMPILER) + /* gcc gets this right */ + return _mm_set1_epi8(0xFF); +#else + /* trick from Intel's optimization guide to generate all-ones. 
+ * ICC converts this to the single cmpeq instruction */ + return _mm_cmpeq_epi8(_mm_setzero_si128(), _mm_setzero_si128()); +#endif +} + +static really_inline m128 zeroes128(void) { + return _mm_setzero_si128(); +} + +/** \brief Bitwise not for m128*/ +static really_inline m128 not128(m128 a) { + return _mm_xor_si128(a, ones128()); +} + +/** \brief Return 1 if a and b are different otherwise 0 */ +static really_inline int diff128(m128 a, m128 b) { + return (_mm_movemask_epi8(_mm_cmpeq_epi8(a, b)) ^ 0xffff); +} + +static really_inline int isnonzero128(m128 a) { + return !!diff128(a, zeroes128()); +} + +/** + * "Rich" version of diff128(). Takes two vectors a and b and returns a 4-bit + * mask indicating which 32-bit words contain differences. + */ +static really_inline u32 diffrich128(m128 a, m128 b) { + a = _mm_cmpeq_epi32(a, b); + return ~(_mm_movemask_ps(_mm_castsi128_ps(a))) & 0xf; +} + +/** + * "Rich" version of diff128(), 64-bit variant. Takes two vectors a and b and + * returns a 4-bit mask indicating which 64-bit words contain differences. + */ +static really_inline u32 diffrich64_128(m128 a, m128 b) { +#if defined(HAVE_SSE41) + a = _mm_cmpeq_epi64(a, b); + return ~(_mm_movemask_ps(_mm_castsi128_ps(a))) & 0x5; +#else + u32 d = diffrich128(a, b); + return (d | (d >> 1)) & 0x5; +#endif +} + +static really_really_inline +m128 lshift64_m128(m128 a, unsigned b) { +#if defined(HAVE__BUILTIN_CONSTANT_P) + if (__builtin_constant_p(b)) { + return _mm_slli_epi64(a, b); + } +#endif + m128 x = _mm_cvtsi32_si128(b); + return _mm_sll_epi64(a, x); +} + +#define rshift64_m128(a, b) _mm_srli_epi64((a), (b)) +#define eq128(a, b) _mm_cmpeq_epi8((a), (b)) +#define movemask128(a) ((u32)_mm_movemask_epi8((a))) + +#if defined(HAVE_AVX512) +static really_inline m128 cast512to128(const m512 in) { + return _mm512_castsi512_si128(in); +} +#endif + +static really_inline m128 set16x8(u8 c) { + return _mm_set1_epi8(c); +} + +static really_inline m128 set4x32(u32 c) { + return _mm_set1_epi32(c); +} + +static really_inline u32 movd(const m128 in) { + return _mm_cvtsi128_si32(in); +} + +#if defined(HAVE_AVX512) +static really_inline u32 movd512(const m512 in) { + // NOTE: seems gcc doesn't support _mm512_cvtsi512_si32(in), + // so we use 2-step convertions to work around. + return _mm_cvtsi128_si32(_mm512_castsi512_si128(in)); +} + +static really_inline u64a movq512(const m512 in) { + // NOTE: seems AVX512 doesn't support _mm512_cvtsi512_si64(in), + // so we use 2-step convertions to work around. 
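+    // (two steps: _mm512_castsi512_si128 keeps the low 128 bits, then
+    // _mm_cvtsi128_si64 returns their low 64 bits)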
+ return _mm_cvtsi128_si64(_mm512_castsi512_si128(in)); +} +#endif + +static really_inline u64a movq(const m128 in) { +#if defined(ARCH_X86_64) + return _mm_cvtsi128_si64(in); +#else // 32-bit - this is horrific + u32 lo = movd(in); + u32 hi = movd(_mm_srli_epi64(in, 32)); + return (u64a)hi << 32 | lo; +#endif +} + +/* another form of movq */ +static really_inline +m128 load_m128_from_u64a(const u64a *p) { + return _mm_set_epi64x(0LL, *p); +} + +#define rshiftbyte_m128(a, count_immed) _mm_srli_si128(a, count_immed) +#define lshiftbyte_m128(a, count_immed) _mm_slli_si128(a, count_immed) + +#if defined(HAVE_SSE41) +#define extract32from128(a, imm) _mm_extract_epi32(a, imm) +#define extract64from128(a, imm) _mm_extract_epi64(a, imm) +#else +#define extract32from128(a, imm) movd(_mm_srli_si128(a, imm << 2)) +#define extract64from128(a, imm) movq(_mm_srli_si128(a, imm << 3)) +#endif + +#if !defined(HAVE_AVX2) +// TODO: this entire file needs restructuring - this carveout is awful +#define extractlow64from256(a) movq(a.lo) +#define extractlow32from256(a) movd(a.lo) +#if defined(HAVE_SSE41) +#define extract32from256(a, imm) _mm_extract_epi32((imm >> 2) ? a.hi : a.lo, imm % 4) +#define extract64from256(a, imm) _mm_extract_epi64((imm >> 1) ? a.hi : a.lo, imm % 2) +#else +#define extract32from256(a, imm) movd(_mm_srli_si128((imm >> 2) ? a.hi : a.lo, (imm % 4) * 4)) +#define extract64from256(a, imm) movq(_mm_srli_si128((imm >> 1) ? a.hi : a.lo, (imm % 2) * 8)) +#endif + +#endif // !AVX2 + +static really_inline m128 and128(m128 a, m128 b) { + return _mm_and_si128(a,b); +} + +static really_inline m128 xor128(m128 a, m128 b) { + return _mm_xor_si128(a,b); +} + +static really_inline m128 or128(m128 a, m128 b) { + return _mm_or_si128(a,b); +} + +#if defined(HAVE_AVX512VBMI) +static really_inline m512 expand128(m128 a) { + return _mm512_broadcast_i32x4(a); +} + +static really_inline m512 expand256(m256 a) { + return _mm512_broadcast_i64x4(a); +} + +static really_inline m512 expand384(m384 a) { + u64a *lo = (u64a*)&a.lo; + u64a *mid = (u64a*)&a.mid; + u64a *hi = (u64a*)&a.hi; + return _mm512_set_epi64(0ULL, 0ULL, hi[1], hi[0], mid[1], mid[0], + lo[1], lo[0]); +} +#endif + +static really_inline m128 andnot128(m128 a, m128 b) { + return _mm_andnot_si128(a, b); +} + +// aligned load +static really_inline m128 load128(const void *ptr) { + assert(ISALIGNED_N(ptr, alignof(m128))); + ptr = assume_aligned(ptr, 16); + return _mm_load_si128((const m128 *)ptr); +} + +// aligned store +static really_inline void store128(void *ptr, m128 a) { + assert(ISALIGNED_N(ptr, alignof(m128))); + ptr = assume_aligned(ptr, 16); + *(m128 *)ptr = a; +} + +// unaligned load +static really_inline m128 loadu128(const void *ptr) { + return _mm_loadu_si128((const m128 *)ptr); +} + +// unaligned store +static really_inline void storeu128(void *ptr, m128 a) { + _mm_storeu_si128 ((m128 *)ptr, a); +} + +// packed unaligned store of first N bytes +static really_inline +void storebytes128(void *ptr, m128 a, unsigned int n) { + assert(n <= sizeof(a)); + memcpy(ptr, &a, n); +} + +// packed unaligned load of first N bytes, pad with zero +static really_inline +m128 loadbytes128(const void *ptr, unsigned int n) { + m128 a = zeroes128(); + assert(n <= sizeof(a)); + memcpy(&a, ptr, n); + return a; +} + +#ifdef __cplusplus +extern "C" { +#endif +extern const u8 simd_onebit_masks[]; +#ifdef __cplusplus +} +#endif + +static really_inline +m128 mask1bit128(unsigned int n) { + assert(n < sizeof(m128) * 8); + u32 mask_idx = ((n % 8) * 64) + 95; + mask_idx 
-= n / 8; + return loadu128(&simd_onebit_masks[mask_idx]); +} + +// switches on bit N in the given vector. +static really_inline +void setbit128(m128 *ptr, unsigned int n) { + *ptr = or128(mask1bit128(n), *ptr); +} + +// switches off bit N in the given vector. +static really_inline +void clearbit128(m128 *ptr, unsigned int n) { + *ptr = andnot128(mask1bit128(n), *ptr); +} + +// tests bit N in the given vector. +static really_inline +char testbit128(m128 val, unsigned int n) { + const m128 mask = mask1bit128(n); +#if defined(HAVE_SSE41) + return !_mm_testz_si128(mask, val); +#else + return isnonzero128(and128(mask, val)); +#endif +} + +// offset must be an immediate +#define palignr(r, l, offset) _mm_alignr_epi8(r, l, offset) + +static really_inline +m128 pshufb_m128(m128 a, m128 b) { + m128 result; + result = _mm_shuffle_epi8(a, b); + return result; +} + +static really_inline +m256 pshufb_m256(m256 a, m256 b) { +#if defined(HAVE_AVX2) + return _mm256_shuffle_epi8(a, b); +#else + m256 rv; + rv.lo = pshufb_m128(a.lo, b.lo); + rv.hi = pshufb_m128(a.hi, b.hi); + return rv; +#endif +} + +#if defined(HAVE_AVX512) +static really_inline +m512 pshufb_m512(m512 a, m512 b) { + return _mm512_shuffle_epi8(a, b); +} + +static really_inline +m512 maskz_pshufb_m512(__mmask64 k, m512 a, m512 b) { + return _mm512_maskz_shuffle_epi8(k, a, b); +} + +#if defined(HAVE_AVX512VBMI) +#define vpermb512(idx, a) _mm512_permutexvar_epi8(idx, a) +#define maskz_vpermb512(k, idx, a) _mm512_maskz_permutexvar_epi8(k, idx, a) +#endif + +#endif + +static really_inline +m128 variable_byte_shift_m128(m128 in, s32 amount) { + assert(amount >= -16 && amount <= 16); + m128 shift_mask = loadu128(vbs_mask_data + 16 - amount); + return pshufb_m128(in, shift_mask); +} + +static really_inline +m128 max_u8_m128(m128 a, m128 b) { + return _mm_max_epu8(a, b); +} + +static really_inline +m128 min_u8_m128(m128 a, m128 b) { + return _mm_min_epu8(a, b); +} + +static really_inline +m128 sadd_u8_m128(m128 a, m128 b) { + return _mm_adds_epu8(a, b); +} + +static really_inline +m128 sub_u8_m128(m128 a, m128 b) { + return _mm_sub_epi8(a, b); +} + +static really_inline +m128 set64x2(u64a hi, u64a lo) { + return _mm_set_epi64x(hi, lo); +} + +/**** + **** 256-bit Primitives + ****/ + +#if defined(HAVE_AVX2) + +static really_really_inline +m256 lshift64_m256(m256 a, unsigned b) { +#if defined(HAVE__BUILTIN_CONSTANT_P) + if (__builtin_constant_p(b)) { + return _mm256_slli_epi64(a, b); + } +#endif + m128 x = _mm_cvtsi32_si128(b); + return _mm256_sll_epi64(a, x); +} + +#define rshift64_m256(a, b) _mm256_srli_epi64((a), (b)) + +static really_inline +m256 set32x8(u32 in) { + return _mm256_set1_epi8(in); +} + +#define eq256(a, b) _mm256_cmpeq_epi8((a), (b)) +#define movemask256(a) ((u32)_mm256_movemask_epi8((a))) + +static really_inline +m256 set2x128(m128 a) { + return _mm256_broadcastsi128_si256(a); +} + +#else + +static really_really_inline +m256 lshift64_m256(m256 a, int b) { + m256 rv = a; + rv.lo = lshift64_m128(rv.lo, b); + rv.hi = lshift64_m128(rv.hi, b); + return rv; +} + +static really_inline +m256 rshift64_m256(m256 a, int b) { + m256 rv = a; + rv.lo = rshift64_m128(rv.lo, b); + rv.hi = rshift64_m128(rv.hi, b); + return rv; +} +static really_inline +m256 set32x8(u32 in) { + m256 rv; + rv.lo = set16x8((u8) in); + rv.hi = rv.lo; + return rv; +} + +static really_inline +m256 eq256(m256 a, m256 b) { + m256 rv; + rv.lo = eq128(a.lo, b.lo); + rv.hi = eq128(a.hi, b.hi); + return rv; +} + +static really_inline +u32 movemask256(m256 a) { + u32 lo_mask = 
movemask128(a.lo); + u32 hi_mask = movemask128(a.hi); + return lo_mask | (hi_mask << 16); +} + +static really_inline +m256 set2x128(m128 a) { + m256 rv = {a, a}; + return rv; +} +#endif + +static really_inline m256 zeroes256(void) { +#if defined(HAVE_AVX2) + return _mm256_setzero_si256(); +#else + m256 rv = {zeroes128(), zeroes128()}; + return rv; +#endif +} + +static really_inline m256 ones256(void) { +#if defined(HAVE_AVX2) + m256 rv = _mm256_set1_epi8(0xFF); +#else + m256 rv = {ones128(), ones128()}; +#endif + return rv; +} + +#if defined(HAVE_AVX2) +static really_inline m256 and256(m256 a, m256 b) { + return _mm256_and_si256(a, b); +} +#else +static really_inline m256 and256(m256 a, m256 b) { + m256 rv; + rv.lo = and128(a.lo, b.lo); + rv.hi = and128(a.hi, b.hi); + return rv; +} +#endif + +#if defined(HAVE_AVX2) +static really_inline m256 or256(m256 a, m256 b) { + return _mm256_or_si256(a, b); +} +#else +static really_inline m256 or256(m256 a, m256 b) { + m256 rv; + rv.lo = or128(a.lo, b.lo); + rv.hi = or128(a.hi, b.hi); + return rv; +} +#endif + +#if defined(HAVE_AVX2) +static really_inline m256 xor256(m256 a, m256 b) { + return _mm256_xor_si256(a, b); +} +#else +static really_inline m256 xor256(m256 a, m256 b) { + m256 rv; + rv.lo = xor128(a.lo, b.lo); + rv.hi = xor128(a.hi, b.hi); + return rv; +} +#endif + +#if defined(HAVE_AVX2) +static really_inline m256 not256(m256 a) { + return _mm256_xor_si256(a, ones256()); +} +#else +static really_inline m256 not256(m256 a) { + m256 rv; + rv.lo = not128(a.lo); + rv.hi = not128(a.hi); + return rv; +} +#endif + +#if defined(HAVE_AVX2) +static really_inline m256 andnot256(m256 a, m256 b) { + return _mm256_andnot_si256(a, b); +} +#else +static really_inline m256 andnot256(m256 a, m256 b) { + m256 rv; + rv.lo = andnot128(a.lo, b.lo); + rv.hi = andnot128(a.hi, b.hi); + return rv; +} +#endif + +static really_inline int diff256(m256 a, m256 b) { +#if defined(HAVE_AVX2) + return !!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(a, b)) ^ (int)-1); +#else + return diff128(a.lo, b.lo) || diff128(a.hi, b.hi); +#endif +} + +static really_inline int isnonzero256(m256 a) { +#if defined(HAVE_AVX2) + return !!diff256(a, zeroes256()); +#else + return isnonzero128(or128(a.lo, a.hi)); +#endif +} + +/** + * "Rich" version of diff256(). Takes two vectors a and b and returns an 8-bit + * mask indicating which 32-bit words contain differences. + */ +static really_inline u32 diffrich256(m256 a, m256 b) { +#if defined(HAVE_AVX2) + a = _mm256_cmpeq_epi32(a, b); + return ~(_mm256_movemask_ps(_mm256_castsi256_ps(a))) & 0xFF; +#else + m128 z = zeroes128(); + a.lo = _mm_cmpeq_epi32(a.lo, b.lo); + a.hi = _mm_cmpeq_epi32(a.hi, b.hi); + m128 packed = _mm_packs_epi16(_mm_packs_epi32(a.lo, a.hi), z); + return ~(_mm_movemask_epi8(packed)) & 0xff; +#endif +} + +/** + * "Rich" version of diff256(), 64-bit variant. Takes two vectors a and b and + * returns an 8-bit mask indicating which 64-bit words contain differences. 
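+ *
+ * Implemented by folding the diffrich256() result: the two 32-bit difference
+ * bits of each 64-bit word are OR-ed into its even bit position.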
+ */ +static really_inline u32 diffrich64_256(m256 a, m256 b) { + u32 d = diffrich256(a, b); + return (d | (d >> 1)) & 0x55555555; +} + +// aligned load +static really_inline m256 load256(const void *ptr) { + assert(ISALIGNED_N(ptr, alignof(m256))); +#if defined(HAVE_AVX2) + return _mm256_load_si256((const m256 *)ptr); +#else + m256 rv = { load128(ptr), load128((const char *)ptr + 16) }; + return rv; +#endif +} + +// aligned load of 128-bit value to low and high part of 256-bit value +static really_inline m256 load2x128(const void *ptr) { +#if defined(HAVE_AVX2) + return set2x128(load128(ptr)); +#else + assert(ISALIGNED_N(ptr, alignof(m128))); + m256 rv; + rv.hi = rv.lo = load128(ptr); + return rv; +#endif +} + +static really_inline m256 loadu2x128(const void *ptr) { + return set2x128(loadu128(ptr)); +} + +// aligned store +static really_inline void store256(void *ptr, m256 a) { + assert(ISALIGNED_N(ptr, alignof(m256))); +#if defined(HAVE_AVX2) + _mm256_store_si256((m256 *)ptr, a); +#else + ptr = assume_aligned(ptr, 16); + *(m256 *)ptr = a; +#endif +} + +// unaligned load +static really_inline m256 loadu256(const void *ptr) { +#if defined(HAVE_AVX2) + return _mm256_loadu_si256((const m256 *)ptr); +#else + m256 rv = { loadu128(ptr), loadu128((const char *)ptr + 16) }; + return rv; +#endif +} + +// unaligned store +static really_inline void storeu256(void *ptr, m256 a) { +#if defined(HAVE_AVX2) + _mm256_storeu_si256((m256 *)ptr, a); +#else + storeu128(ptr, a.lo); + storeu128((char *)ptr + 16, a.hi); +#endif +} + +// packed unaligned store of first N bytes +static really_inline +void storebytes256(void *ptr, m256 a, unsigned int n) { + assert(n <= sizeof(a)); + memcpy(ptr, &a, n); +} + +// packed unaligned load of first N bytes, pad with zero +static really_inline +m256 loadbytes256(const void *ptr, unsigned int n) { + m256 a = zeroes256(); + assert(n <= sizeof(a)); + memcpy(&a, ptr, n); + return a; +} + +static really_inline +m256 mask1bit256(unsigned int n) { + assert(n < sizeof(m256) * 8); + u32 mask_idx = ((n % 8) * 64) + 95; + mask_idx -= n / 8; + return loadu256(&simd_onebit_masks[mask_idx]); +} + +static really_inline +m256 set64x4(u64a hi_1, u64a hi_0, u64a lo_1, u64a lo_0) { +#if defined(HAVE_AVX2) + return _mm256_set_epi64x(hi_1, hi_0, lo_1, lo_0); +#else + m256 rv; + rv.hi = set64x2(hi_1, hi_0); + rv.lo = set64x2(lo_1, lo_0); + return rv; +#endif +} + +#if !defined(HAVE_AVX2) +// switches on bit N in the given vector. +static really_inline +void setbit256(m256 *ptr, unsigned int n) { + assert(n < sizeof(*ptr) * 8); + m128 *sub; + if (n < 128) { + sub = &ptr->lo; + } else { + sub = &ptr->hi; + n -= 128; + } + setbit128(sub, n); +} + +// switches off bit N in the given vector. +static really_inline +void clearbit256(m256 *ptr, unsigned int n) { + assert(n < sizeof(*ptr) * 8); + m128 *sub; + if (n < 128) { + sub = &ptr->lo; + } else { + sub = &ptr->hi; + n -= 128; + } + clearbit128(sub, n); +} + +// tests bit N in the given vector. +static really_inline +char testbit256(m256 val, unsigned int n) { + assert(n < sizeof(val) * 8); + m128 sub; + if (n < 128) { + sub = val.lo; + } else { + sub = val.hi; + n -= 128; + } + return testbit128(sub, n); +} + +static really_really_inline +m128 movdq_hi(m256 x) { + return x.hi; +} + +static really_really_inline +m128 movdq_lo(m256 x) { + return x.lo; +} + +static really_inline +m256 combine2x128(m128 hi, m128 lo) { + m256 rv = {lo, hi}; + return rv; +} + +#else // AVX2 + +// switches on bit N in the given vector. 
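+// Sets the bit by OR-ing in the one-bit mask produced by mask1bit256().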
+static really_inline +void setbit256(m256 *ptr, unsigned int n) { + *ptr = or256(mask1bit256(n), *ptr); +} + +static really_inline +void clearbit256(m256 *ptr, unsigned int n) { + *ptr = andnot256(mask1bit256(n), *ptr); +} + +// tests bit N in the given vector. +static really_inline +char testbit256(m256 val, unsigned int n) { + const m256 mask = mask1bit256(n); + return !_mm256_testz_si256(mask, val); +} + +static really_really_inline +m128 movdq_hi(m256 x) { + return _mm256_extracti128_si256(x, 1); +} + +static really_really_inline +m128 movdq_lo(m256 x) { + return _mm256_extracti128_si256(x, 0); +} + +#define cast256to128(a) _mm256_castsi256_si128(a) +#define cast128to256(a) _mm256_castsi128_si256(a) +#define swap128in256(a) _mm256_permute4x64_epi64(a, 0x4E) +#define insert128to256(a, b, imm) _mm256_inserti128_si256(a, b, imm) +#define rshift128_m256(a, count_immed) _mm256_srli_si256(a, count_immed) +#define lshift128_m256(a, count_immed) _mm256_slli_si256(a, count_immed) +#define extract64from256(a, imm) _mm_extract_epi64(_mm256_extracti128_si256(a, imm >> 1), imm % 2) +#define extract32from256(a, imm) _mm_extract_epi32(_mm256_extracti128_si256(a, imm >> 2), imm % 4) +#define extractlow64from256(a) _mm_cvtsi128_si64(cast256to128(a)) +#define extractlow32from256(a) movd(cast256to128(a)) +#define interleave256hi(a, b) _mm256_unpackhi_epi8(a, b) +#define interleave256lo(a, b) _mm256_unpacklo_epi8(a, b) +#define vpalignr(r, l, offset) _mm256_alignr_epi8(r, l, offset) + +static really_inline +m256 combine2x128(m128 hi, m128 lo) { +#if defined(_mm256_set_m128i) + return _mm256_set_m128i(hi, lo); +#else + return insert128to256(cast128to256(lo), hi, 1); +#endif +} +#endif //AVX2 + +#if defined(HAVE_AVX512) +#define extract128from512(a, imm) _mm512_extracti32x4_epi32(a, imm) +#define interleave512hi(a, b) _mm512_unpackhi_epi8(a, b) +#define interleave512lo(a, b) _mm512_unpacklo_epi8(a, b) +#define set2x256(a) _mm512_broadcast_i64x4(a) +#define mask_set2x256(src, k, a) _mm512_mask_broadcast_i64x4(src, k, a) +#define vpermq512(idx, a) _mm512_permutexvar_epi64(idx, a) +#endif + +/**** + **** 384-bit Primitives + ****/ + +static really_inline m384 and384(m384 a, m384 b) { + m384 rv; + rv.lo = and128(a.lo, b.lo); + rv.mid = and128(a.mid, b.mid); + rv.hi = and128(a.hi, b.hi); + return rv; +} + +static really_inline m384 or384(m384 a, m384 b) { + m384 rv; + rv.lo = or128(a.lo, b.lo); + rv.mid = or128(a.mid, b.mid); + rv.hi = or128(a.hi, b.hi); + return rv; +} + +static really_inline m384 xor384(m384 a, m384 b) { + m384 rv; + rv.lo = xor128(a.lo, b.lo); + rv.mid = xor128(a.mid, b.mid); + rv.hi = xor128(a.hi, b.hi); + return rv; +} +static really_inline m384 not384(m384 a) { + m384 rv; + rv.lo = not128(a.lo); + rv.mid = not128(a.mid); + rv.hi = not128(a.hi); + return rv; +} +static really_inline m384 andnot384(m384 a, m384 b) { + m384 rv; + rv.lo = andnot128(a.lo, b.lo); + rv.mid = andnot128(a.mid, b.mid); + rv.hi = andnot128(a.hi, b.hi); + return rv; +} + +static really_really_inline +m384 lshift64_m384(m384 a, unsigned b) { + m384 rv; + rv.lo = lshift64_m128(a.lo, b); + rv.mid = lshift64_m128(a.mid, b); + rv.hi = lshift64_m128(a.hi, b); + return rv; +} + +static really_inline m384 zeroes384(void) { + m384 rv = {zeroes128(), zeroes128(), zeroes128()}; + return rv; +} + +static really_inline m384 ones384(void) { + m384 rv = {ones128(), ones128(), ones128()}; + return rv; +} + +static really_inline int diff384(m384 a, m384 b) { + return diff128(a.lo, b.lo) || diff128(a.mid, b.mid) || diff128(a.hi, 
b.hi); +} + +static really_inline int isnonzero384(m384 a) { + return isnonzero128(or128(or128(a.lo, a.mid), a.hi)); +} + +/** + * "Rich" version of diff384(). Takes two vectors a and b and returns a 12-bit + * mask indicating which 32-bit words contain differences. + */ +static really_inline u32 diffrich384(m384 a, m384 b) { + m128 z = zeroes128(); + a.lo = _mm_cmpeq_epi32(a.lo, b.lo); + a.mid = _mm_cmpeq_epi32(a.mid, b.mid); + a.hi = _mm_cmpeq_epi32(a.hi, b.hi); + m128 packed = _mm_packs_epi16(_mm_packs_epi32(a.lo, a.mid), + _mm_packs_epi32(a.hi, z)); + return ~(_mm_movemask_epi8(packed)) & 0xfff; +} + +/** + * "Rich" version of diff384(), 64-bit variant. Takes two vectors a and b and + * returns a 12-bit mask indicating which 64-bit words contain differences. + */ +static really_inline u32 diffrich64_384(m384 a, m384 b) { + u32 d = diffrich384(a, b); + return (d | (d >> 1)) & 0x55555555; +} + +// aligned load +static really_inline m384 load384(const void *ptr) { + assert(ISALIGNED_16(ptr)); + m384 rv = { load128(ptr), load128((const char *)ptr + 16), + load128((const char *)ptr + 32) }; + return rv; +} + +// aligned store +static really_inline void store384(void *ptr, m384 a) { + assert(ISALIGNED_16(ptr)); + ptr = assume_aligned(ptr, 16); + *(m384 *)ptr = a; +} + +// unaligned load +static really_inline m384 loadu384(const void *ptr) { + m384 rv = { loadu128(ptr), loadu128((const char *)ptr + 16), + loadu128((const char *)ptr + 32)}; + return rv; +} + +// packed unaligned store of first N bytes +static really_inline +void storebytes384(void *ptr, m384 a, unsigned int n) { + assert(n <= sizeof(a)); + memcpy(ptr, &a, n); +} + +// packed unaligned load of first N bytes, pad with zero +static really_inline +m384 loadbytes384(const void *ptr, unsigned int n) { + m384 a = zeroes384(); + assert(n <= sizeof(a)); + memcpy(&a, ptr, n); + return a; +} + +// switches on bit N in the given vector. +static really_inline +void setbit384(m384 *ptr, unsigned int n) { + assert(n < sizeof(*ptr) * 8); + m128 *sub; + if (n < 128) { + sub = &ptr->lo; + } else if (n < 256) { + sub = &ptr->mid; + } else { + sub = &ptr->hi; + } + setbit128(sub, n % 128); +} + +// switches off bit N in the given vector. +static really_inline +void clearbit384(m384 *ptr, unsigned int n) { + assert(n < sizeof(*ptr) * 8); + m128 *sub; + if (n < 128) { + sub = &ptr->lo; + } else if (n < 256) { + sub = &ptr->mid; + } else { + sub = &ptr->hi; + } + clearbit128(sub, n % 128); +} + +// tests bit N in the given vector. 
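+// The index first selects one of the three 128-bit lanes (lo/mid/hi), then
+// the test is performed within that lane.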
+static really_inline +char testbit384(m384 val, unsigned int n) { + assert(n < sizeof(val) * 8); + m128 sub; + if (n < 128) { + sub = val.lo; + } else if (n < 256) { + sub = val.mid; + } else { + sub = val.hi; + } + return testbit128(sub, n % 128); +} + +/**** + **** 512-bit Primitives + ****/ + +#define eq512mask(a, b) _mm512_cmpeq_epi8_mask((a), (b)) +#define masked_eq512mask(k, a, b) _mm512_mask_cmpeq_epi8_mask((k), (a), (b)) + +static really_inline +m512 zeroes512(void) { +#if defined(HAVE_AVX512) + return _mm512_setzero_si512(); +#else + m512 rv = {zeroes256(), zeroes256()}; + return rv; +#endif +} + +static really_inline +m512 ones512(void) { +#if defined(HAVE_AVX512) + return _mm512_set1_epi8(0xFF); + //return _mm512_xor_si512(_mm512_setzero_si512(), _mm512_setzero_si512()); +#else + m512 rv = {ones256(), ones256()}; + return rv; +#endif +} + +#if defined(HAVE_AVX512) +static really_inline +m512 set64x8(u8 a) { + return _mm512_set1_epi8(a); +} + +static really_inline +m512 set8x64(u64a a) { + return _mm512_set1_epi64(a); +} + +static really_inline +m512 set16x32(u32 a) { + return _mm512_set1_epi32(a); +} + +static really_inline +m512 set512_64(u64a hi_3, u64a hi_2, u64a hi_1, u64a hi_0, + u64a lo_3, u64a lo_2, u64a lo_1, u64a lo_0) { + return _mm512_set_epi64(hi_3, hi_2, hi_1, hi_0, + lo_3, lo_2, lo_1, lo_0); +} + +static really_inline +m512 swap256in512(m512 a) { + m512 idx = set512_64(3ULL, 2ULL, 1ULL, 0ULL, 7ULL, 6ULL, 5ULL, 4ULL); + return vpermq512(idx, a); +} + +static really_inline +m512 set4x128(m128 a) { + return _mm512_broadcast_i32x4(a); +} + +static really_inline +m512 sadd_u8_m512(m512 a, m512 b) { + return _mm512_adds_epu8(a, b); +} + +static really_inline +m512 max_u8_m512(m512 a, m512 b) { + return _mm512_max_epu8(a, b); +} + +static really_inline +m512 min_u8_m512(m512 a, m512 b) { + return _mm512_min_epu8(a, b); +} + +static really_inline +m512 sub_u8_m512(m512 a, m512 b) { + return _mm512_sub_epi8(a, b); +} +#endif + +static really_inline +m512 and512(m512 a, m512 b) { +#if defined(HAVE_AVX512) + return _mm512_and_si512(a, b); +#else + m512 rv; + rv.lo = and256(a.lo, b.lo); + rv.hi = and256(a.hi, b.hi); + return rv; +#endif +} + +static really_inline +m512 or512(m512 a, m512 b) { +#if defined(HAVE_AVX512) + return _mm512_or_si512(a, b); +#else + m512 rv; + rv.lo = or256(a.lo, b.lo); + rv.hi = or256(a.hi, b.hi); + return rv; +#endif +} + +static really_inline +m512 xor512(m512 a, m512 b) { +#if defined(HAVE_AVX512) + return _mm512_xor_si512(a, b); +#else + m512 rv; + rv.lo = xor256(a.lo, b.lo); + rv.hi = xor256(a.hi, b.hi); + return rv; +#endif +} + +static really_inline +m512 not512(m512 a) { +#if defined(HAVE_AVX512) + return _mm512_xor_si512(a, ones512()); +#else + m512 rv; + rv.lo = not256(a.lo); + rv.hi = not256(a.hi); + return rv; +#endif +} + +static really_inline +m512 andnot512(m512 a, m512 b) { +#if defined(HAVE_AVX512) + return _mm512_andnot_si512(a, b); +#else + m512 rv; + rv.lo = andnot256(a.lo, b.lo); + rv.hi = andnot256(a.hi, b.hi); + return rv; +#endif +} + +#if defined(HAVE_AVX512) +static really_really_inline +m512 lshift64_m512(m512 a, unsigned b) { +#if defined(HAVE__BUILTIN_CONSTANT_P) + if (__builtin_constant_p(b)) { + return _mm512_slli_epi64(a, b); + } +#endif + m128 x = _mm_cvtsi32_si128(b); + return _mm512_sll_epi64(a, x); +} +#else +static really_really_inline +m512 lshift64_m512(m512 a, unsigned b) { + m512 rv; + rv.lo = lshift64_m256(a.lo, b); + rv.hi = lshift64_m256(a.hi, b); + return rv; +} +#endif + +#if defined(HAVE_AVX512) 
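+/* Per-64-bit-element right shift and byte shifts within each 128-bit lane. */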
+#define rshift64_m512(a, b) _mm512_srli_epi64((a), (b)) +#define rshift128_m512(a, count_immed) _mm512_bsrli_epi128(a, count_immed) +#define lshift128_m512(a, count_immed) _mm512_bslli_epi128(a, count_immed) +#endif + +#if !defined(_MM_CMPINT_NE) +#define _MM_CMPINT_NE 0x4 +#endif + +static really_inline +int diff512(m512 a, m512 b) { +#if defined(HAVE_AVX512) + return !!_mm512_cmp_epi8_mask(a, b, _MM_CMPINT_NE); +#else + return diff256(a.lo, b.lo) || diff256(a.hi, b.hi); +#endif +} + +static really_inline +int isnonzero512(m512 a) { +#if defined(HAVE_AVX512) + return diff512(a, zeroes512()); +#elif defined(HAVE_AVX2) + m256 x = or256(a.lo, a.hi); + return !!diff256(x, zeroes256()); +#else + m128 x = or128(a.lo.lo, a.lo.hi); + m128 y = or128(a.hi.lo, a.hi.hi); + return isnonzero128(or128(x, y)); +#endif +} + +/** + * "Rich" version of diff512(). Takes two vectors a and b and returns a 16-bit + * mask indicating which 32-bit words contain differences. + */ +static really_inline +u32 diffrich512(m512 a, m512 b) { +#if defined(HAVE_AVX512) + return _mm512_cmp_epi32_mask(a, b, _MM_CMPINT_NE); +#elif defined(HAVE_AVX2) + return diffrich256(a.lo, b.lo) | (diffrich256(a.hi, b.hi) << 8); +#else + a.lo.lo = _mm_cmpeq_epi32(a.lo.lo, b.lo.lo); + a.lo.hi = _mm_cmpeq_epi32(a.lo.hi, b.lo.hi); + a.hi.lo = _mm_cmpeq_epi32(a.hi.lo, b.hi.lo); + a.hi.hi = _mm_cmpeq_epi32(a.hi.hi, b.hi.hi); + m128 packed = _mm_packs_epi16(_mm_packs_epi32(a.lo.lo, a.lo.hi), + _mm_packs_epi32(a.hi.lo, a.hi.hi)); + return ~(_mm_movemask_epi8(packed)) & 0xffff; +#endif +} + +/** + * "Rich" version of diffrich(), 64-bit variant. Takes two vectors a and b and + * returns a 16-bit mask indicating which 64-bit words contain differences. + */ +static really_inline +u32 diffrich64_512(m512 a, m512 b) { + //TODO: cmp_epi64? 
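+    // Fold each pair of 32-bit difference bits into its 64-bit word, as in
+    // the narrower diffrich64_* variants.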
+ u32 d = diffrich512(a, b); + return (d | (d >> 1)) & 0x55555555; +} + +// aligned load +static really_inline +m512 load512(const void *ptr) { +#if defined(HAVE_AVX512) + return _mm512_load_si512(ptr); +#else + assert(ISALIGNED_N(ptr, alignof(m256))); + m512 rv = { load256(ptr), load256((const char *)ptr + 32) }; + return rv; +#endif +} + +// aligned store +static really_inline +void store512(void *ptr, m512 a) { + assert(ISALIGNED_N(ptr, alignof(m512))); +#if defined(HAVE_AVX512) + return _mm512_store_si512(ptr, a); +#elif defined(HAVE_AVX2) + m512 *x = (m512 *)ptr; + store256(&x->lo, a.lo); + store256(&x->hi, a.hi); +#else + ptr = assume_aligned(ptr, 16); + *(m512 *)ptr = a; +#endif +} + +// unaligned load +static really_inline +m512 loadu512(const void *ptr) { +#if defined(HAVE_AVX512) + return _mm512_loadu_si512(ptr); +#else + m512 rv = { loadu256(ptr), loadu256((const char *)ptr + 32) }; + return rv; +#endif +} + +// unaligned store +static really_inline +void storeu512(void *ptr, m512 a) { +#if defined(HAVE_AVX512) + _mm512_storeu_si512((m512 *)ptr, a); +#elif defined(HAVE_AVX2) + storeu256(ptr, a.lo); + storeu256((char *)ptr + 32, a.hi); +#else + storeu128(ptr, a.lo.lo); + storeu128((char *)ptr + 16, a.lo.hi); + storeu128((char *)ptr + 32, a.hi.lo); + storeu128((char *)ptr + 48, a.hi.hi); +#endif +} + +#if defined(HAVE_AVX512) +static really_inline +m512 loadu_maskz_m512(__mmask64 k, const void *ptr) { + return _mm512_maskz_loadu_epi8(k, ptr); +} + +static really_inline +m512 loadu_mask_m512(m512 src, __mmask64 k, const void *ptr) { + return _mm512_mask_loadu_epi8(src, k, ptr); +} + +static really_inline +void storeu_mask_m512(void *ptr, __mmask64 k, m512 a) { + _mm512_mask_storeu_epi8(ptr, k, a); +} + +static really_inline +m512 set_mask_m512(__mmask64 k) { + return _mm512_movm_epi8(k); +} + +static really_inline +m256 loadu_maskz_m256(__mmask32 k, const void *ptr) { + return _mm256_maskz_loadu_epi8(k, ptr); +} +#endif + +// packed unaligned store of first N bytes +static really_inline +void storebytes512(void *ptr, m512 a, unsigned int n) { + assert(n <= sizeof(a)); + memcpy(ptr, &a, n); +} + +// packed unaligned load of first N bytes, pad with zero +static really_inline +m512 loadbytes512(const void *ptr, unsigned int n) { + m512 a = zeroes512(); + assert(n <= sizeof(a)); + memcpy(&a, ptr, n); + return a; +} + +static really_inline +m512 mask1bit512(unsigned int n) { + assert(n < sizeof(m512) * 8); + u32 mask_idx = ((n % 8) * 64) + 95; + mask_idx -= n / 8; + return loadu512(&simd_onebit_masks[mask_idx]); +} + +// switches on bit N in the given vector. +static really_inline +void setbit512(m512 *ptr, unsigned int n) { + assert(n < sizeof(*ptr) * 8); +#if !defined(HAVE_AVX2) + m128 *sub; + if (n < 128) { + sub = &ptr->lo.lo; + } else if (n < 256) { + sub = &ptr->lo.hi; + } else if (n < 384) { + sub = &ptr->hi.lo; + } else { + sub = &ptr->hi.hi; + } + setbit128(sub, n % 128); +#elif defined(HAVE_AVX512) + *ptr = or512(mask1bit512(n), *ptr); +#else + m256 *sub; + if (n < 256) { + sub = &ptr->lo; + } else { + sub = &ptr->hi; + n -= 256; + } + setbit256(sub, n); +#endif +} + +// switches off bit N in the given vector. 
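+// With AVX512 this AND-NOTs a one-bit mask into the vector; otherwise the
+// bit is cleared within the containing 128- or 256-bit lane.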
+static really_inline +void clearbit512(m512 *ptr, unsigned int n) { + assert(n < sizeof(*ptr) * 8); +#if !defined(HAVE_AVX2) + m128 *sub; + if (n < 128) { + sub = &ptr->lo.lo; + } else if (n < 256) { + sub = &ptr->lo.hi; + } else if (n < 384) { + sub = &ptr->hi.lo; + } else { + sub = &ptr->hi.hi; + } + clearbit128(sub, n % 128); +#elif defined(HAVE_AVX512) + *ptr = andnot512(mask1bit512(n), *ptr); +#else + m256 *sub; + if (n < 256) { + sub = &ptr->lo; + } else { + sub = &ptr->hi; + n -= 256; + } + clearbit256(sub, n); +#endif +} + +// tests bit N in the given vector. +static really_inline +char testbit512(m512 val, unsigned int n) { + assert(n < sizeof(val) * 8); +#if !defined(HAVE_AVX2) + m128 sub; + if (n < 128) { + sub = val.lo.lo; + } else if (n < 256) { + sub = val.lo.hi; + } else if (n < 384) { + sub = val.hi.lo; + } else { + sub = val.hi.hi; + } + return testbit128(sub, n % 128); +#elif defined(HAVE_AVX512) + const m512 mask = mask1bit512(n); + return !!_mm512_test_epi8_mask(mask, val); +#else + m256 sub; + if (n < 256) { + sub = val.lo; + } else { + sub = val.hi; + n -= 256; + } + return testbit256(sub, n); +#endif +} + +#endif diff --git a/regex/util/state_compress.c b/regex/util/state_compress.c new file mode 100644 index 000000000..e29d5935d --- /dev/null +++ b/regex/util/state_compress.c @@ -0,0 +1,617 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Mask-based state compression, used by the NFA. + */ +#include "config.h" +#include "ue2common.h" +#include "arch.h" +#include "bitutils.h" +#include "unaligned.h" +#include "pack_bits.h" +#include "partial_store.h" +#include "popcount.h" +#include "state_compress.h" + +#ifndef __KERNEL__ +#include +#else +#include +#endif + +/* + * 32-bit store/load. 
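+ *
+ * storecompressed32() keeps only the bits of *x selected by the mask *m
+ * (compress32) and writes the low `bytes` bytes of the packed value;
+ * loadcompressed32() reverses this with expand32(). A round trip restores
+ * the masked-in bits and zeroes the rest: e.g. with *m = 0x0000ff00 and
+ * bytes = 1, bits 8..15 of *x survive.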
+ */ + +void storecompressed32(void *ptr, const u32 *x, const u32 *m, u32 bytes) { + assert(popcount32(*m) <= bytes * 8); + + u32 v = compress32(*x, *m); + partial_store_u32(ptr, v, bytes); +} + +void loadcompressed32(u32 *x, const void *ptr, const u32 *m, u32 bytes) { + assert(popcount32(*m) <= bytes * 8); + + u32 v = partial_load_u32(ptr, bytes); + *x = expand32(v, *m); +} + +/* + * 64-bit store/load. + */ + +void storecompressed64(void *ptr, const u64a *x, const u64a *m, u32 bytes) { + assert(popcount64(*m) <= bytes * 8); + + u64a v = compress64(*x, *m); + partial_store_u64a(ptr, v, bytes); +} + +void loadcompressed64(u64a *x, const void *ptr, const u64a *m, u32 bytes) { + assert(popcount64(*m) <= bytes * 8); + + u64a v = partial_load_u64a(ptr, bytes); + *x = expand64(v, *m); +} + +/* + * 128-bit store/load. + */ + +#if defined(ARCH_32_BIT) +static really_inline +void storecompressed128_32bit(void *ptr, m128 xvec, m128 mvec) { + // First, decompose our vectors into 32-bit chunks. + u32 x[4]; + memcpy(x, &xvec, sizeof(xvec)); + u32 m[4]; + memcpy(m, &mvec, sizeof(mvec)); + + // Count the number of bits of compressed state we're writing out per + // chunk. + u32 bits[4] = { popcount32(m[0]), popcount32(m[1]), + popcount32(m[2]), popcount32(m[3]) }; + + // Compress each 32-bit chunk individually. + u32 v[4] = { compress32(x[0], m[0]), compress32(x[1], m[1]), + compress32(x[2], m[2]), compress32(x[3], m[3]) }; + + // Write packed data out. + pack_bits_32(ptr, v, bits, 4); +} +#endif + +#if defined(ARCH_64_BIT) +static really_inline +void storecompressed128_64bit(void *ptr, m128 xvec, m128 mvec) { + // First, decompose our vectors into 64-bit chunks. + u64a x[2]; + memcpy(x, &xvec, sizeof(xvec)); + u64a m[2]; + memcpy(m, &mvec, sizeof(mvec)); + + // Count the number of bits of compressed state we're writing out per + // chunk. + u32 bits[2] = { popcount64(m[0]), popcount64(m[1]) }; + + // Compress each 64-bit chunk individually. + u64a v[2] = { compress64(x[0], m[0]), compress64(x[1], m[1]) }; + + // Write packed data out. + pack_bits_64(ptr, v, bits, 2); +} +#endif + +void storecompressed128(void *ptr, const m128 *x, const m128 *m, + UNUSED u32 bytes) { +#if defined(ARCH_64_BIT) + storecompressed128_64bit(ptr, *x, *m); +#else + storecompressed128_32bit(ptr, *x, *m); +#endif +} + +#if defined(ARCH_32_BIT) +static really_inline +m128 loadcompressed128_32bit(const void *ptr, m128 mvec) { + // First, decompose our vectors into 32-bit chunks. + u32 m[8]; + memcpy(m, &mvec, sizeof(mvec)); + + u32 bits[4] = { popcount32(m[0]), popcount32(m[1]), + popcount32(m[2]), popcount32(m[3]) }; + u32 v[4]; + + unpack_bits_32(v, (const u8 *)ptr, bits, 4); + + u32 x[4] = { expand32(v[0], m[0]), expand32(v[1], m[1]), + expand32(v[2], m[2]), expand32(v[3], m[3]) }; + + return _mm_set_epi32(x[3], x[2], x[1], x[0]); +} +#endif + +#if defined(ARCH_64_BIT) +static really_inline +m128 loadcompressed128_64bit(const void *ptr, m128 mvec) { + // First, decompose our vectors into 64-bit chunks. 
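+    // (low half via movq(); high half by shifting it down 8 bytes first)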
+ u64a m[2] = { movq(mvec), movq(_mm_srli_si128(mvec, 8)) }; + + u32 bits[2] = { popcount64(m[0]), popcount64(m[1]) }; + u64a v[2]; + + unpack_bits_64(v, (const u8 *)ptr, bits, 2); + + u64a x[2] = { expand64(v[0], m[0]), expand64(v[1], m[1]) }; + + return _mm_set_epi64x(x[1], x[0]); +} +#endif + +void loadcompressed128(m128 *x, const void *ptr, const m128 *m, + UNUSED u32 bytes) { +#if defined(ARCH_64_BIT) + *x = loadcompressed128_64bit(ptr, *m); +#else + *x = loadcompressed128_32bit(ptr, *m); +#endif +} + +/* + * 256-bit store/load. + */ + +#if defined(ARCH_32_BIT) +static really_inline +void storecompressed256_32bit(void *ptr, m256 xvec, m256 mvec) { + // First, decompose our vectors into 32-bit chunks. + u32 x[8]; + memcpy(x, &xvec, sizeof(xvec)); + u32 m[8]; + memcpy(m, &mvec, sizeof(mvec)); + + // Count the number of bits of compressed state we're writing out per + // chunk. + u32 bits[8] = { popcount32(m[0]), popcount32(m[1]), + popcount32(m[2]), popcount32(m[3]), + popcount32(m[4]), popcount32(m[5]), + popcount32(m[6]), popcount32(m[7])}; + + // Compress each 32-bit chunk individually. + u32 v[8] = { compress32(x[0], m[0]), compress32(x[1], m[1]), + compress32(x[2], m[2]), compress32(x[3], m[3]), + compress32(x[4], m[4]), compress32(x[5], m[5]), + compress32(x[6], m[6]), compress32(x[7], m[7]) }; + + // Write packed data out. + pack_bits_32(ptr, v, bits, 8); +} +#endif + +#if defined(ARCH_64_BIT) +static really_really_inline +void storecompressed256_64bit(void *ptr, m256 xvec, m256 mvec) { + // First, decompose our vectors into 64-bit chunks. + u64a x[4]; + memcpy(x, &xvec, sizeof(xvec)); + u64a m[4]; + memcpy(m, &mvec, sizeof(mvec)); + + // Count the number of bits of compressed state we're writing out per + // chunk. + u32 bits[4] = { popcount64(m[0]), popcount64(m[1]), + popcount64(m[2]), popcount64(m[3]) }; + + // Compress each 64-bit chunk individually. + u64a v[4] = { compress64(x[0], m[0]), compress64(x[1], m[1]), + compress64(x[2], m[2]), compress64(x[3], m[3]) }; + + // Write packed data out. + pack_bits_64(ptr, v, bits, 4); +} +#endif + +void storecompressed256(void *ptr, const m256 *x, const m256 *m, + UNUSED u32 bytes) { +#if defined(ARCH_64_BIT) + storecompressed256_64bit(ptr, *x, *m); +#else + storecompressed256_32bit(ptr, *x, *m); +#endif +} + +#if defined(ARCH_32_BIT) +static really_inline +m256 loadcompressed256_32bit(const void *ptr, m256 mvec) { + // First, decompose our vectors into 32-bit chunks. + u32 m[8]; + memcpy(m, &mvec, sizeof(mvec)); + + u32 bits[8] = { popcount32(m[0]), popcount32(m[1]), + popcount32(m[2]), popcount32(m[3]), + popcount32(m[4]), popcount32(m[5]), + popcount32(m[6]), popcount32(m[7])}; + u32 v[8]; + + unpack_bits_32(v, (const u8 *)ptr, bits, 8); + + u32 x[8] = { expand32(v[0], m[0]), expand32(v[1], m[1]), + expand32(v[2], m[2]), expand32(v[3], m[3]), + expand32(v[4], m[4]), expand32(v[5], m[5]), + expand32(v[6], m[6]), expand32(v[7], m[7]) }; + +#if !defined(HAVE_AVX2) + m256 xvec = { .lo = _mm_set_epi32(x[3], x[2], x[1], x[0]), + .hi = _mm_set_epi32(x[7], x[6], x[5], x[4]) }; +#else + m256 xvec = _mm256_set_epi32(x[7], x[6], x[5], x[4], + x[3], x[2], x[1], x[0]); +#endif + return xvec; +} +#endif + +#if defined(ARCH_64_BIT) +static really_inline +m256 loadcompressed256_64bit(const void *ptr, m256 mvec) { + // First, decompose our vectors into 64-bit chunks. 
+ u64a m[4]; + memcpy(m, &mvec, sizeof(mvec)); + + u32 bits[4] = { popcount64(m[0]), popcount64(m[1]), + popcount64(m[2]), popcount64(m[3]) }; + u64a v[4]; + + unpack_bits_64(v, (const u8 *)ptr, bits, 4); + + u64a x[4] = { expand64(v[0], m[0]), expand64(v[1], m[1]), + expand64(v[2], m[2]), expand64(v[3], m[3]) }; + +#if !defined(HAVE_AVX2) + m256 xvec = { .lo = _mm_set_epi64x(x[1], x[0]), + .hi = _mm_set_epi64x(x[3], x[2]) }; +#else + m256 xvec = _mm256_set_epi64x(x[3], x[2], x[1], x[0]); +#endif + return xvec; +} +#endif + +void loadcompressed256(m256 *x, const void *ptr, const m256 *m, + UNUSED u32 bytes) { +#if defined(ARCH_64_BIT) + *x = loadcompressed256_64bit(ptr, *m); +#else + *x = loadcompressed256_32bit(ptr, *m); +#endif +} + +/* + * 384-bit store/load. + */ + +#if defined(ARCH_32_BIT) +static really_inline +void storecompressed384_32bit(void *ptr, m384 xvec, m384 mvec) { + // First, decompose our vectors into 32-bit chunks. + u32 x[12]; + memcpy(x, &xvec, sizeof(xvec)); + u32 m[12]; + memcpy(m, &mvec, sizeof(mvec)); + + // Count the number of bits of compressed state we're writing out per + // chunk. + u32 bits[12] = { popcount32(m[0]), popcount32(m[1]), + popcount32(m[2]), popcount32(m[3]), + popcount32(m[4]), popcount32(m[5]), + popcount32(m[6]), popcount32(m[7]), + popcount32(m[8]), popcount32(m[9]), + popcount32(m[10]), popcount32(m[11]) }; + + // Compress each 32-bit chunk individually. + u32 v[12] = { compress32(x[0], m[0]), compress32(x[1], m[1]), + compress32(x[2], m[2]), compress32(x[3], m[3]), + compress32(x[4], m[4]), compress32(x[5], m[5]), + compress32(x[6], m[6]), compress32(x[7], m[7]), + compress32(x[8], m[8]), compress32(x[9], m[9]), + compress32(x[10], m[10]), compress32(x[11], m[11])}; + + // Write packed data out. + pack_bits_32(ptr, v, bits, 12); +} +#endif + +#if defined(ARCH_64_BIT) +static really_inline +void storecompressed384_64bit(void *ptr, m384 xvec, m384 mvec) { + // First, decompose our vectors into 64-bit chunks. + u64a x[6]; + memcpy(x, &xvec, sizeof(xvec)); + u64a m[6]; + memcpy(m, &mvec, sizeof(mvec)); + + // Count the number of bits of compressed state we're writing out per + // chunk. + u32 bits[6] = { popcount64(m[0]), popcount64(m[1]), + popcount64(m[2]), popcount64(m[3]), + popcount64(m[4]), popcount64(m[5]) }; + + // Compress each 64-bit chunk individually. + u64a v[6] = { compress64(x[0], m[0]), compress64(x[1], m[1]), + compress64(x[2], m[2]), compress64(x[3], m[3]), + compress64(x[4], m[4]), compress64(x[5], m[5]) }; + + // Write packed data out. + pack_bits_64(ptr, v, bits, 6); +} +#endif + +void storecompressed384(void *ptr, const m384 *x, const m384 *m, + UNUSED u32 bytes) { +#if defined(ARCH_64_BIT) + storecompressed384_64bit(ptr, *x, *m); +#else + storecompressed384_32bit(ptr, *x, *m); +#endif +} + +#if defined(ARCH_32_BIT) +static really_inline +m384 loadcompressed384_32bit(const void *ptr, m384 mvec) { + // First, decompose our vectors into 32-bit chunks. 
+ u32 m[12]; + memcpy(m, &mvec, sizeof(mvec)); + + u32 bits[12] = { popcount32(m[0]), popcount32(m[1]), + popcount32(m[2]), popcount32(m[3]), + popcount32(m[4]), popcount32(m[5]), + popcount32(m[6]), popcount32(m[7]), + popcount32(m[8]), popcount32(m[9]), + popcount32(m[10]), popcount32(m[11]) }; + u32 v[12]; + + unpack_bits_32(v, (const u8 *)ptr, bits, 12); + + u32 x[12] = { expand32(v[0], m[0]), expand32(v[1], m[1]), + expand32(v[2], m[2]), expand32(v[3], m[3]), + expand32(v[4], m[4]), expand32(v[5], m[5]), + expand32(v[6], m[6]), expand32(v[7], m[7]), + expand32(v[8], m[8]), expand32(v[9], m[9]), + expand32(v[10], m[10]), expand32(v[11], m[11]) }; + + m384 xvec = { .lo = _mm_set_epi32(x[3], x[2], x[1], x[0]), + .mid = _mm_set_epi32(x[7], x[6], x[5], x[4]), + .hi = _mm_set_epi32(x[11], x[10], x[9], x[8]) }; + return xvec; +} +#endif + +#if defined(ARCH_64_BIT) +static really_inline +m384 loadcompressed384_64bit(const void *ptr, m384 mvec) { + // First, decompose our vectors into 64-bit chunks. + u64a m[6]; + memcpy(m, &mvec, sizeof(mvec)); + + u32 bits[6] = { popcount64(m[0]), popcount64(m[1]), + popcount64(m[2]), popcount64(m[3]), + popcount64(m[4]), popcount64(m[5]) }; + u64a v[6]; + + unpack_bits_64(v, (const u8 *)ptr, bits, 6); + + u64a x[6] = { expand64(v[0], m[0]), expand64(v[1], m[1]), + expand64(v[2], m[2]), expand64(v[3], m[3]), + expand64(v[4], m[4]), expand64(v[5], m[5]) }; + + m384 xvec = { .lo = _mm_set_epi64x(x[1], x[0]), + .mid = _mm_set_epi64x(x[3], x[2]), + .hi = _mm_set_epi64x(x[5], x[4]) }; + return xvec; +} +#endif + +void loadcompressed384(m384 *x, const void *ptr, const m384 *m, + UNUSED u32 bytes) { +#if defined(ARCH_64_BIT) + *x = loadcompressed384_64bit(ptr, *m); +#else + *x = loadcompressed384_32bit(ptr, *m); +#endif +} + +/* + * 512-bit store/load. + */ + +#if defined(ARCH_32_BIT) +static really_inline +void storecompressed512_32bit(void *ptr, m512 xvec, m512 mvec) { + // First, decompose our vectors into 32-bit chunks. + u32 x[16]; + memcpy(x, &xvec, sizeof(xvec)); + u32 m[16]; + memcpy(m, &mvec, sizeof(mvec)); + + // Count the number of bits of compressed state we're writing out per + // chunk. + u32 bits[16] = { popcount32(m[0]), popcount32(m[1]), + popcount32(m[2]), popcount32(m[3]), + popcount32(m[4]), popcount32(m[5]), + popcount32(m[6]), popcount32(m[7]), + popcount32(m[8]), popcount32(m[9]), + popcount32(m[10]), popcount32(m[11]), + popcount32(m[12]), popcount32(m[13]), + popcount32(m[14]), popcount32(m[15])}; + + // Compress each 32-bit chunk individually. + u32 v[16] = { compress32(x[0], m[0]), compress32(x[1], m[1]), + compress32(x[2], m[2]), compress32(x[3], m[3]), + compress32(x[4], m[4]), compress32(x[5], m[5]), + compress32(x[6], m[6]), compress32(x[7], m[7]), + compress32(x[8], m[8]), compress32(x[9], m[9]), + compress32(x[10], m[10]), compress32(x[11], m[11]), + compress32(x[12], m[12]), compress32(x[13], m[13]), + compress32(x[14], m[14]), compress32(x[15], m[15]) }; + + // Write packed data out. + pack_bits_32(ptr, v, bits, 16); +} +#endif + +#if defined(ARCH_64_BIT) +static really_inline +void storecompressed512_64bit(void *ptr, m512 xvec, m512 mvec) { + // First, decompose our vectors into 64-bit chunks. + u64a m[8]; + memcpy(m, &mvec, sizeof(mvec)); + u64a x[8]; + memcpy(x, &xvec, sizeof(xvec)); + + // Count the number of bits of compressed state we're writing out per + // chunk. 
+ u32 bits[8] = { popcount64(m[0]), popcount64(m[1]), + popcount64(m[2]), popcount64(m[3]), + popcount64(m[4]), popcount64(m[5]), + popcount64(m[6]), popcount64(m[7]) }; + + // Compress each 64-bit chunk individually. + u64a v[8] = { compress64(x[0], m[0]), compress64(x[1], m[1]), + compress64(x[2], m[2]), compress64(x[3], m[3]), + compress64(x[4], m[4]), compress64(x[5], m[5]), + compress64(x[6], m[6]), compress64(x[7], m[7]) }; + + // Write packed data out. + pack_bits_64(ptr, v, bits, 8); +} +#endif + +void storecompressed512(void *ptr, const m512 *x, const m512 *m, + UNUSED u32 bytes) { +#if defined(ARCH_64_BIT) + storecompressed512_64bit(ptr, *x, *m); +#else + storecompressed512_32bit(ptr, *x, *m); +#endif +} + +#if defined(ARCH_32_BIT) +static really_inline +m512 loadcompressed512_32bit(const void *ptr, m512 mvec) { + // First, decompose our vectors into 32-bit chunks. + u32 m[16]; + memcpy(m, &mvec, sizeof(mvec)); + + u32 bits[16] = { popcount32(m[0]), popcount32(m[1]), + popcount32(m[2]), popcount32(m[3]), + popcount32(m[4]), popcount32(m[5]), + popcount32(m[6]), popcount32(m[7]), + popcount32(m[8]), popcount32(m[9]), + popcount32(m[10]), popcount32(m[11]), + popcount32(m[12]), popcount32(m[13]), + popcount32(m[14]), popcount32(m[15]) }; + u32 v[16]; + + unpack_bits_32(v, (const u8 *)ptr, bits, 16); + + u32 x[16] = { expand32(v[0], m[0]), expand32(v[1], m[1]), + expand32(v[2], m[2]), expand32(v[3], m[3]), + expand32(v[4], m[4]), expand32(v[5], m[5]), + expand32(v[6], m[6]), expand32(v[7], m[7]), + expand32(v[8], m[8]), expand32(v[9], m[9]), + expand32(v[10], m[10]), expand32(v[11], m[11]), + expand32(v[12], m[12]), expand32(v[13], m[13]), + expand32(v[14], m[14]), expand32(v[15], m[15]) }; + + m512 xvec; +#if defined(HAVE_AVX512) + xvec = _mm512_set_epi32(x[15], x[14], x[13], x[12], + x[11], x[10], x[9], x[8], + x[7], x[6], x[5], x[4], + x[3], x[2], x[1], x[0]); +#elif defined(HAVE_AVX2) + xvec.lo = _mm256_set_epi32(x[7], x[6], x[5], x[4], + x[3], x[2], x[1], x[0]); + xvec.hi = _mm256_set_epi32(x[15], x[14], x[13], x[12], + x[11], x[10], x[9], x[8]); +#else + xvec.lo.lo = _mm_set_epi32(x[3], x[2], x[1], x[0]); + xvec.lo.hi = _mm_set_epi32(x[7], x[6], x[5], x[4]); + xvec.hi.lo = _mm_set_epi32(x[11], x[10], x[9], x[8]); + xvec.hi.hi = _mm_set_epi32(x[15], x[14], x[13], x[12]); +#endif + return xvec; +} +#endif + +#if defined(ARCH_64_BIT) +static really_inline +m512 loadcompressed512_64bit(const void *ptr, m512 mvec) { + // First, decompose our vectors into 64-bit chunks. 
+ u64a m[8]; + memcpy(m, &mvec, sizeof(mvec)); + + u32 bits[8] = { popcount64(m[0]), popcount64(m[1]), + popcount64(m[2]), popcount64(m[3]), + popcount64(m[4]), popcount64(m[5]), + popcount64(m[6]), popcount64(m[7]) }; + u64a v[8]; + + unpack_bits_64(v, (const u8 *)ptr, bits, 8); + + u64a x[8] = { expand64(v[0], m[0]), expand64(v[1], m[1]), + expand64(v[2], m[2]), expand64(v[3], m[3]), + expand64(v[4], m[4]), expand64(v[5], m[5]), + expand64(v[6], m[6]), expand64(v[7], m[7]) }; + +#if defined(HAVE_AVX512) + m512 xvec = _mm512_set_epi64(x[7], x[6], x[5], x[4], + x[3], x[2], x[1], x[0]); +#elif defined(HAVE_AVX2) + m512 xvec = { .lo = _mm256_set_epi64x(x[3], x[2], x[1], x[0]), + .hi = _mm256_set_epi64x(x[7], x[6], x[5], x[4])}; +#else + m512 xvec = { .lo = { _mm_set_epi64x(x[1], x[0]), + _mm_set_epi64x(x[3], x[2]) }, + .hi = { _mm_set_epi64x(x[5], x[4]), + _mm_set_epi64x(x[7], x[6]) } }; +#endif + return xvec; +} +#endif + +void loadcompressed512(m512 *x, const void *ptr, const m512 *m, + UNUSED u32 bytes) { +#if defined(ARCH_64_BIT) + *x = loadcompressed512_64bit(ptr, *m); +#else + *x = loadcompressed512_32bit(ptr, *m); +#endif +} diff --git a/regex/util/state_compress.h b/regex/util/state_compress.h new file mode 100644 index 000000000..a17d2355c --- /dev/null +++ b/regex/util/state_compress.h @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2015, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Mask-based state compression, used by the NFA. 
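+ *
+ * Each store routine packs the state bits selected by a mask into a compact
+ * byte buffer; the matching load routine expands them back. Round-trip
+ * sketch (buf/state/mask are hypothetical caller-side names):
+ *
+ *   char buf[sizeof(m128)];
+ *   storecompressed128(buf, &state, &mask, sizeof(buf));
+ *   loadcompressed128(&state, buf, &mask, sizeof(buf));
+ *
+ * The `bytes` argument matters only for the 32- and 64-bit variants; the
+ * 128-bit and wider implementations ignore it (see the note below).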
+ */ + +#ifndef STATE_COMPRESS_H +#define STATE_COMPRESS_H + +#include "simd_utils.h" +#include "ue2common.h" + +#ifdef __cplusplus +extern "C" +{ +#endif + +/* Note: bytes is not used by implementations >= 128 */ + +void storecompressed32(void *ptr, const u32 *x, const u32 *m, u32 bytes); +void loadcompressed32(u32 *x, const void *ptr, const u32 *m, u32 bytes); + +void storecompressed64(void *ptr, const u64a *x, const u64a *m, u32 bytes); +void loadcompressed64(u64a *x, const void *ptr, const u64a *m, u32 bytes); + +void storecompressed128(void *ptr, const m128 *x, const m128 *m, u32 bytes); +void loadcompressed128(m128 *x, const void *ptr, const m128 *m, u32 bytes); + +void storecompressed256(void *ptr, const m256 *x, const m256 *m, u32 bytes); +void loadcompressed256(m256 *x, const void *ptr, const m256 *m, u32 bytes); + +void storecompressed384(void *ptr, const m384 *x, const m384 *m, u32 bytes); +void loadcompressed384(m384 *x, const void *ptr, const m384 *m, u32 bytes); + +void storecompressed512(void *ptr, const m512 *x, const m512 *m, u32 bytes); +void loadcompressed512(m512 *x, const void *ptr, const m512 *m, u32 bytes); + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif diff --git a/regex/util/unaligned.h b/regex/util/unaligned.h new file mode 100644 index 000000000..299e5677c --- /dev/null +++ b/regex/util/unaligned.h @@ -0,0 +1,98 @@ +/* + * Copyright (c) 2015, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Helper functions for unaligned loads and stores. 
+ */ + +#ifndef UNALIGNED_H +#define UNALIGNED_H + +#include "ue2common.h" + +#if !defined(_WIN32) +#define PACKED__MAY_ALIAS __attribute__((packed, may_alias)) +#else +#define PACKED__MAY_ALIAS +#pragma pack(push, 1) // pack everything until told otherwise +#endif + +/// Perform an unaligned 16-bit load +static really_inline +u16 unaligned_load_u16(const void *ptr) { + struct unaligned { u16 u; } PACKED__MAY_ALIAS; + const struct unaligned *uptr = (const struct unaligned *)ptr; + return uptr->u; +} + +/// Perform an unaligned 32-bit load +static really_inline +u32 unaligned_load_u32(const void *ptr) { + struct unaligned { u32 u; } PACKED__MAY_ALIAS; + const struct unaligned *uptr = (const struct unaligned *)ptr; + return uptr->u; +} + +/// Perform an unaligned 64-bit load +static really_inline +u64a unaligned_load_u64a(const void *ptr) { + struct unaligned { u64a u; } PACKED__MAY_ALIAS; + const struct unaligned *uptr = (const struct unaligned *)ptr; + return uptr->u; +} + +/// Perform an unaligned 16-bit store +static really_inline +void unaligned_store_u16(void *ptr, u16 val) { + struct unaligned { u16 u; } PACKED__MAY_ALIAS; + struct unaligned *uptr = (struct unaligned *)ptr; + uptr->u = val; +} + +/// Perform an unaligned 32-bit store +static really_inline +void unaligned_store_u32(void *ptr, u32 val) { + struct unaligned { u32 u; } PACKED__MAY_ALIAS; + struct unaligned *uptr = (struct unaligned *)ptr; + uptr->u = val; +} + +/// Perform an unaligned 64-bit store +static really_inline +void unaligned_store_u64a(void *ptr, u64a val) { + struct unaligned { u64a u; } PACKED__MAY_ALIAS; + struct unaligned *uptr = (struct unaligned *)ptr; + uptr->u = val; +} +#if defined(_WIN32) +#pragma pack(pop) +#endif // win32 + +#undef PACKED__MAY_ALIAS + +#endif // UNALIGNED_H diff --git a/regex/util/uniform_ops.h b/regex/util/uniform_ops.h new file mode 100644 index 000000000..262104aca --- /dev/null +++ b/regex/util/uniform_ops.h @@ -0,0 +1,243 @@ +/* + * Copyright (c) 2015-2020, Intel Corporation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +/** \file + * \brief Uniformly-named primitives named by target type. + * + * The following are a set of primitives named by target type, so that we can + * macro the hell out of all our NFA implementations. Hurrah! + */ + +#ifndef UNIFORM_OPS_H +#define UNIFORM_OPS_H + +#include "ue2common.h" +#include "simd_utils.h" +#include "unaligned.h" + +// Aligned loads +#define load_u8(a) (*(const u8 *)(a)) +#define load_u16(a) (*(const u16 *)(a)) +#define load_u32(a) (*(const u32 *)(a)) +#define load_u64a(a) (*(const u64a *)(a)) +#define load_m128(a) load128(a) +#define load_m256(a) load256(a) +#define load_m384(a) load384(a) +#define load_m512(a) load512(a) + +// Unaligned loads +#define loadu_u8(a) (*(const u8 *)(a)) +#define loadu_u16(a) unaligned_load_u16((const u8 *)(a)) +#define loadu_u32(a) unaligned_load_u32((const u8 *)(a)) +#define loadu_u64a(a) unaligned_load_u64a((const u8 *)(a)) +#define loadu_m128(a) loadu128(a) +#define loadu_m256(a) loadu256(a) +#define loadu_m384(a) loadu384(a) +#define loadu_m512(a) loadu512(a) + +// Aligned stores +#define store_u8(ptr, a) do { *(u8 *)(ptr) = (a); } while(0) +#define store_u16(ptr, a) do { *(u16 *)(ptr) = (a); } while(0) +#define store_u32(ptr, a) do { *(u32 *)(ptr) = (a); } while(0) +#define store_u64a(ptr, a) do { *(u64a *)(ptr) = (a); } while(0) +#define store_m128(ptr, a) store128(ptr, a) +#define store_m256(ptr, a) store256(ptr, a) +#define store_m384(ptr, a) store384(ptr, a) +#define store_m512(ptr, a) store512(ptr, a) + +// Unaligned stores +#define storeu_u8(ptr, a) do { *(u8 *)(ptr) = (a); } while(0) +#define storeu_u16(ptr, a) unaligned_store_u16(ptr, a) +#define storeu_u32(ptr, a) unaligned_store_u32(ptr, a) +#define storeu_u64a(ptr, a) unaligned_store_u64a(ptr, a) +#define storeu_m128(ptr, a) storeu128(ptr, a) + +#define zero_u8 0 +#define zero_u32 0 +#define zero_u64a 0 +#define zero_m128 zeroes128() +#define zero_m256 zeroes256() +#define zero_m384 zeroes384() +#define zero_m512 zeroes512() + +#define ones_u8 0xff +#define ones_u32 0xfffffffful +#define ones_u64a 0xffffffffffffffffull +#define ones_m128 ones128() +#define ones_m256 ones256() +#define ones_m384 ones384() +#define ones_m512 ones512() + +#define or_u8(a, b) ((a) | (b)) +#define or_u32(a, b) ((a) | (b)) +#define or_u64a(a, b) ((a) | (b)) +#define or_m128(a, b) (or128(a, b)) +#define or_m256(a, b) (or256(a, b)) +#define or_m384(a, b) (or384(a, b)) +#define or_m512(a, b) (or512(a, b)) + +#if defined(HAVE_AVX512VBMI) +#define expand_m128(a) (expand128(a)) +#define expand_m256(a) (expand256(a)) +#define expand_m384(a) (expand384(a)) +#define expand_m512(a) (a) + +#define shuffle_byte_m128(a, b) (pshufb_m512(b, a)) +#define shuffle_byte_m256(a, b) (vpermb512(a, b)) +#define shuffle_byte_m384(a, b) (vpermb512(a, b)) +#define shuffle_byte_m512(a, b) (vpermb512(a, b)) +#endif + +#define and_u8(a, b) ((a) & (b)) +#define and_u32(a, b) ((a) & (b)) +#define and_u64a(a, b) ((a) & (b)) +#define and_m128(a, b) (and128(a, b)) +#define and_m256(a, b) (and256(a, b)) +#define and_m384(a, b) (and384(a, b)) +#define and_m512(a, b) (and512(a, b)) + +#define not_u8(a) (~(a)) +#define not_u32(a) (~(a)) +#define not_u64a(a) (~(a)) +#define not_m128(a) (not128(a)) +#define not_m256(a) (not256(a)) +#define not_m384(a) (not384(a)) +#define not_m512(a) (not512(a)) + +#define andnot_u8(a, b) ((~(a)) & (b)) +#define andnot_u32(a, b) ((~(a)) & (b)) +#define andnot_u64a(a, b) ((~(a)) & (b)) +#define andnot_m128(a, b) (andnot128(a, b)) +#define andnot_m256(a, b) (andnot256(a, b)) +#define 
andnot_m384(a, b) (andnot384(a, b)) +#define andnot_m512(a, b) (andnot512(a, b)) + +#define lshift_u32(a, b) ((a) << (b)) +#define lshift_u64a(a, b) ((a) << (b)) +#define lshift_m128(a, b) (lshift64_m128(a, b)) +#define lshift_m256(a, b) (lshift64_m256(a, b)) +#define lshift_m384(a, b) (lshift64_m384(a, b)) +#define lshift_m512(a, b) (lshift64_m512(a, b)) + +#define isZero_u8(a) ((a) == 0) +#define isZero_u32(a) ((a) == 0) +#define isZero_u64a(a) ((a) == 0) +#define isZero_m128(a) (!isnonzero128(a)) +#define isZero_m256(a) (!isnonzero256(a)) +#define isZero_m384(a) (!isnonzero384(a)) +#define isZero_m512(a) (!isnonzero512(a)) + +#define isNonZero_u8(a) ((a) != 0) +#define isNonZero_u32(a) ((a) != 0) +#define isNonZero_u64a(a) ((a) != 0) +#define isNonZero_m128(a) (isnonzero128(a)) +#define isNonZero_m256(a) (isnonzero256(a)) +#define isNonZero_m384(a) (isnonzero384(a)) +#define isNonZero_m512(a) (isnonzero512(a)) + +#define diffrich_u32(a, b) ((a) != (b)) +#define diffrich_u64a(a, b) ((a) != (b) ? 3 : 0) //TODO: impl 32bit granularity +#define diffrich_m128(a, b) (diffrich128(a, b)) +#define diffrich_m256(a, b) (diffrich256(a, b)) +#define diffrich_m384(a, b) (diffrich384(a, b)) +#define diffrich_m512(a, b) (diffrich512(a, b)) + +#define diffrich64_u32(a, b) ((a) != (b)) +#define diffrich64_u64a(a, b) ((a) != (b) ? 1 : 0) +#define diffrich64_m128(a, b) (diffrich64_128(a, b)) +#define diffrich64_m256(a, b) (diffrich64_256(a, b)) +#define diffrich64_m384(a, b) (diffrich64_384(a, b)) +#define diffrich64_m512(a, b) (diffrich64_512(a, b)) + +#define noteq_u8(a, b) ((a) != (b)) +#define noteq_u32(a, b) ((a) != (b)) +#define noteq_u64a(a, b) ((a) != (b)) +#define noteq_m128(a, b) (diff128(a, b)) +#define noteq_m256(a, b) (diff256(a, b)) +#define noteq_m384(a, b) (diff384(a, b)) +#define noteq_m512(a, b) (diff512(a, b)) + +#define partial_store_m128(ptr, v, sz) storebytes128(ptr, v, sz) +#define partial_store_m256(ptr, v, sz) storebytes256(ptr, v, sz) +#define partial_store_m384(ptr, v, sz) storebytes384(ptr, v, sz) +#define partial_store_m512(ptr, v, sz) storebytes512(ptr, v, sz) + +#define partial_load_m128(ptr, sz) loadbytes128(ptr, sz) +#define partial_load_m256(ptr, sz) loadbytes256(ptr, sz) +#define partial_load_m384(ptr, sz) loadbytes384(ptr, sz) +#define partial_load_m512(ptr, sz) loadbytes512(ptr, sz) + +#define store_compressed_u32(ptr, x, m, len) storecompressed32(ptr, x, m, len) +#define store_compressed_u64a(ptr, x, m, len) storecompressed64(ptr, x, m, len) +#define store_compressed_m128(ptr, x, m, len) storecompressed128(ptr, x, m, len) +#define store_compressed_m256(ptr, x, m, len) storecompressed256(ptr, x, m, len) +#define store_compressed_m384(ptr, x, m, len) storecompressed384(ptr, x, m, len) +#define store_compressed_m512(ptr, x, m, len) storecompressed512(ptr, x, m, len) + +#define load_compressed_u32(x, ptr, m, len) loadcompressed32(x, ptr, m, len) +#define load_compressed_u64a(x, ptr, m, len) loadcompressed64(x, ptr, m, len) +#define load_compressed_m128(x, ptr, m, len) loadcompressed128(x, ptr, m, len) +#define load_compressed_m256(x, ptr, m, len) loadcompressed256(x, ptr, m, len) +#define load_compressed_m384(x, ptr, m, len) loadcompressed384(x, ptr, m, len) +#define load_compressed_m512(x, ptr, m, len) loadcompressed512(x, ptr, m, len) + +static really_inline +void clearbit_u32(u32 *p, u32 n) { + assert(n < sizeof(*p) * 8); + *p &= ~(1U << n); +} + +static really_inline +void clearbit_u64a(u64a *p, u32 n) { + assert(n < sizeof(*p) * 8); + *p &= ~(1ULL << n); +} + +#define 
clearbit_m128(ptr, n) (clearbit128(ptr, n)) +#define clearbit_m256(ptr, n) (clearbit256(ptr, n)) +#define clearbit_m384(ptr, n) (clearbit384(ptr, n)) +#define clearbit_m512(ptr, n) (clearbit512(ptr, n)) + +static really_inline +char testbit_u32(u32 val, u32 n) { + assert(n < sizeof(val) * 8); + return !!(val & (1U << n)); +} + +static really_inline +char testbit_u64a(u64a val, u32 n) { + assert(n < sizeof(val) * 8); + return !!(val & (1ULL << n)); +} + +#define testbit_m128(val, n) (testbit128(val, n)) +#define testbit_m256(val, n) (testbit256(val, n)) +#define testbit_m384(val, n) (testbit384(val, n)) +#define testbit_m512(val, n) (testbit512(val, n)) + +#endif diff --git a/scripts/install_regex.sh b/scripts/install_regex.sh new file mode 100755 index 000000000..066065f47 --- /dev/null +++ b/scripts/install_regex.sh @@ -0,0 +1,56 @@ +#!/bin/bash + +INST_DIR="/tmp/Hyperscan" +mkdir $INST_DIR +cd $INST_DIR + +git clone https://github.com/adrian-thurston/colm.git +cd colm +./autogen.sh +./configure +make -j$(nproc) +make install + +cd $INST_DIR + +if [[ $LD_LIBRARY_PATH =~ "/usr/local/lib" ]]; then + echo "Path already set." +else + export LD_LIBRARY_PATH="/usr/local/lib" + TTT="$(cat /etc/environment | grep LD_LIBRARY_PATH)" + if [[ ! $TTT =~ "/usr/local/lib" ]]; then + echo "LD_LIBRARY_PATH=\"/usr/local/lib\"" >> /etc/environment + fi + +fi + + +git clone https://github.com/adrian-thurston/ragel.git +cd ragel +./autogen.sh +./configure --with-colm=/usr/local +make -j$(nproc) +make install + +cd $INST_DIR + +wget https://sourceforge.net/projects/pcre/files/pcre/8.45/pcre-8.45.tar.gz +tar -xf pcre-8.45.tar.gz + +cd pcre-8.45 +./configure --enable-pcre16 --enable-pcre32 +make -j$(nproc) +make install + +cd $INST_DIR + +git clone https://github.com/tempesta-tech/linux-regex-module.git +cd linux-regex-module +git checkout ag_changes_for_easy_installation + +cmake -DCMAKE_BUILD_TYPE=Release ./ +make -j$(nproc) +make install + +cd $INST_DIR + diff --git a/scripts/regex_start.sh b/scripts/regex_start.sh new file mode 100755 index 000000000..0b25320c7 --- /dev/null +++ b/scripts/regex_start.sh @@ -0,0 +1,28 @@ +#!/bin/bash + +rmdir /sys/kernel/config/rex/* 2> /dev/null + +script_path="$(dirname $0)" +tmp_path="/tmp/tempesta" + +echo "Start compilation of regex." > /dev/kmsg + +for filename in ${tmp_path}/*.txt; do + name=$(basename "$filename" .txt) + if [[ "$name" != "*" ]]; then + db_path="/sys/kernel/config/rex/${name}" + + rm -rf ${tmp_path}/out/ && mkdir ${tmp_path}/out + #${script_path}/hscollider -e ${filename} -ao ${tmp_path}/out/ -n1 #this version for single block strings + #${script_path}/hscollider -e ${filename} -V5 -ao ${tmp_path}/out/ -n1 #this version starts hscollider from scripts directory + hscollider -e ${filename} -V5 -ao ${tmp_path}/out/ -n1 + + mkdir $db_path + dd if=$(echo ${tmp_path}/out/*.db) of=${db_path}/database + cat "${filename}" > ${db_path}/note + echo "$name" > ${db_path}/id + fi +done + +echo "Compilation of regex files is complete." 
> /dev/kmsg + diff --git a/scripts/regex_stop.sh b/scripts/regex_stop.sh new file mode 100755 index 000000000..740ab8cc4 --- /dev/null +++ b/scripts/regex_stop.sh @@ -0,0 +1,8 @@ +#!/bin/bash + +script_path="$(dirname $0)" + +rm -f /tmp/tempesta/*.txt +rm -rf /tmp/tempesta/out +#rmdir -p /sys/kernel/config/rex/* +rmdir /sys/kernel/config/rex/* \ No newline at end of file diff --git a/scripts/tempesta.sh b/scripts/tempesta.sh index e46ad6c0a..a9ed7f7df 100755 --- a/scripts/tempesta.sh +++ b/scripts/tempesta.sh @@ -35,6 +35,7 @@ fi script_path="$(dirname $0)" tdb_path=${TDB_PATH:="$TFW_ROOT/db/core"} +rgx_path=${REGEX_PATH:="$TFW_ROOT/regex"} tfw_path=${TFW_PATH:="$TFW_ROOT/fw"} tls_path=${TLS_PATH:="$TFW_ROOT/tls"} lib_path=${LIB_PATH:="$TFW_ROOT/lib"} @@ -45,6 +46,7 @@ lib_mod=tempesta_lib tls_mod=tempesta_tls tdb_mod=tempesta_db tfw_mod=tempesta_fw +rgx_mod=xdp_rex declare -r LONG_OPTS="help,load,unload,start,stop,restart,reload" # Exclude loopback interface since it needn't any tuning here: it hasn't RSS @@ -131,6 +133,7 @@ load_modules() { echo "Loading Tempesta kernel modules..." + mkdir /tmp/tempesta # Set verbose kernel logging, # so debug messages are shown on serial console as well. echo '8 7 1 7' > /proc/sys/kernel/printk @@ -144,6 +147,9 @@ load_modules() load_one_module "$tdb_path/$tdb_mod.ko" || error "cannot load tempesta database module" + load_one_module "$rgx_path/$rgx_mod.ko" || + error "cannot load regex module" + load_one_module "$tfw_path/$tfw_mod.ko" "tfw_cfg_path=$tfw_cfg_temp" || error "cannot load tempesta module" } @@ -153,6 +159,8 @@ unload_modules() echo "Un-loading Tempesta kernel modules..." rmmod $tfw_mod + $script_path/regex_stop.sh + rmmod $rgx_mod rmmod $tdb_mod rmmod $tls_mod rmmod $lib_mod @@ -284,6 +292,7 @@ start() unload_modules error "cannot start Tempesta FW (sysctl message: ${err##*: }), please check dmesg" else + $script_path/regex_start.sh echo "done" fi remove_tmp_conf @@ -307,10 +316,12 @@ reload() { update_js_challenge_templates echo "Running live reconfiguration of Tempesta..." + $script_path/regex_stop.sh err=$(start_tempesta_and_check) if [[ $err != "0" ]]; then error "cannot reconfigure Tempesta FW (sysctl message: ${err##*: }), please check dmesg" else + $script_path/regex_start.sh echo "done" remove_tmp_conf fi
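
A note on the mask-based state compression above, for reviewers: storecompressed64()/loadcompressed64() keep only the state bits selected by the mask, pack them densely (compress64() plus pack_bits_64() on store, unpack_bits_64() plus expand64() on load) and drop everything else. The standalone sketch below is illustration only and is not part of the diff; compress_by_mask()/expand_by_mask() are hypothetical portable stand-ins for the tree's compress64()/expand64(). It shows the round-trip property the NFA state code relies on: expand(compress(x, m), m) == (x & m).

/* Illustration only: portable equivalents of the PEXT/PDEP-style helpers. */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Gather the bits of x selected by mask m into the low bits (compress64-like). */
static uint64_t compress_by_mask(uint64_t x, uint64_t m)
{
	uint64_t out = 0, bit = 1;

	for (; m; m &= m - 1, bit <<= 1)
		if (x & m & -m)
			out |= bit;
	return out;
}

/* Scatter the low bits of v back to the mask positions (expand64-like). */
static uint64_t expand_by_mask(uint64_t v, uint64_t m)
{
	uint64_t out = 0, bit = 1;

	for (; m; m &= m - 1, bit <<= 1)
		if (v & bit)
			out |= m & -m;
	return out;
}

int main(void)
{
	uint64_t state = 0x123456789abcdef0ULL;
	uint64_t mask  = 0x00ff00ff00ff00ffULL;	/* only these bits are live */
	uint64_t packed   = compress_by_mask(state, mask);
	uint64_t restored = expand_by_mask(packed, mask);

	/* Bits outside the mask are lost; everything under it round-trips. */
	assert(restored == (state & mask));
	printf("packed=%016llx restored=%016llx\n",
	       (unsigned long long)packed, (unsigned long long)restored);
	return 0;
}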
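
Similarly, the uniformly-named ops in uniform_ops.h exist so that one NFA implementation body can be compiled once per state width by token-pasting the type name onto the operation. A minimal sketch of that pattern follows (illustration only; JOIN, STATE_T and any_live() are hypothetical stand-ins, not identifiers from this patch):

#include <stdint.h>
#include <stdio.h>

typedef uint32_t u32;

/* Two-level paste so that STATE_T is macro-expanded before concatenation. */
#define JOIN_(a, b) a##b
#define JOIN(a, b) JOIN_(a, b)

/* Scalar stand-ins mirroring or_u32() and isZero_u32() from uniform_ops.h. */
#define or_u32(a, b)  ((a) | (b))
#define isZero_u32(a) ((a) == 0)

#define STATE_T u32

/* One generic body; rebuilding with STATE_T set to a vector type would pick
 * up the matching vector definitions instead. */
static int any_live(STATE_T a, STATE_T b)
{
	STATE_T s = JOIN(or_, STATE_T)(a, b);	/* expands to or_u32(a, b) */
	return !JOIN(isZero_, STATE_T)(s);	/* expands to isZero_u32(s) */
}

int main(void)
{
	printf("any_live(0, 4) = %d\n", any_live(0, 4));	/* prints 1 */
	return 0;
}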
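
On the script side: regex_start.sh expects one pattern file per database under /tmp/tempesta/*.txt, compiles each with hscollider and publishes the result through configfs as /sys/kernel/config/rex/<name>/{database,note,id}, presumably for the xdp_rex module to consume; regex_stop.sh removes the pattern files and configfs entries again. tempesta.sh wires these into start, reload and module unload, so pattern files placed under /tmp/tempesta are picked up on every start or reload.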