diff --git a/CHANGELOG.mdx b/CHANGELOG.mdx index 887085775907..8b4f6e9c9c41 100644 --- a/CHANGELOG.mdx +++ b/CHANGELOG.mdx @@ -2788,9 +2788,9 @@ Upgrade dependent libraries with security vulnerabilities (Pillow, TensorFlow, u - [#6285](https://github.com/rasahq/rasa/issues/6285): Predictions of the [`FallbackClassifier`](./building-classic-assistants/components.mdx#fallbackclassifier) are ignored when - [evaluating the NLU model](./production/testing-your-assistant.mdx#evaluating-an-nlu-model) + [evaluating the NLU model](./building-classic-assistants/testing-your-assistant.mdx#evaluating-an-nlu-model) Note that the `FallbackClassifier` predictions still apply to - [test stories](./production/testing-your-assistant.mdx#writing-test-stories). + [test stories](./building-classic-assistants/testing-your-assistant.mdx#writing-test-stories). - [#6474](https://github.com/rasahq/rasa/issues/6474): Adapt the training data reader and emulator for wit.ai to their latest format. - [#6498](https://github.com/rasahq/rasa/issues/6498): Adding configurable prefixes to Redis [Tracker](./production/tracker-stores.mdx) and [Lock Stores](./production/lock-stores.mdx) so that a single Redis instance (and logical DB) can support multiple conversation trackers and locks. By default, conversations will be prefixed with `tracker:...` and all locks prefixed with `lock:...`. Additionally, you can add an alphanumeric-only `prefix: value` in `endpoints.yml` such that keys in redis will take the form `value:tracker:...` and `value:lock:...` respectively. @@ -4164,7 +4164,7 @@ This can help to fix problems when using `rasa shell` with custom actions which ### Improved Documentation -- [#2237](https://github.com/rasahq/rasa/issues/2237): Restructure the “Evaluating models” documentation page and rename this page to [Testing Your Assistant](./production/testing-your-assistant.mdx). 
+- [#2237](https://github.com/rasahq/rasa/issues/2237): Restructure the “Evaluating models” documentation page and rename this page to [Testing Your Assistant](./building-classic-assistants/testing-your-assistant.mdx). - [#5302](https://github.com/rasahq/rasa/issues/5302): Improved documentation on how to build and deploy an action server image for use on other servers such as Rasa X deployments. diff --git a/docs/docs/building-classic-assistants/business-logic.mdx b/docs/docs/building-classic-assistants/business-logic.mdx index 1d692194f733..89a65342bd9d 100644 --- a/docs/docs/building-classic-assistants/business-logic.mdx +++ b/docs/docs/building-classic-assistants/business-logic.mdx @@ -5,6 +5,10 @@ title: Handling Business Logic abstract: Conversational assistants often need to ask users for information in order to help them. You can use Forms to collect the required user information and fulfill a request. --- +import RasaNLUBasedBanner from "@theme/RasaNLUBasedBanner"; + + + Conversational assistants often support user goals that involve collecting required information from the user before doing something for them. For example, a restaurant search bot would need to gather a few pieces of information about the user's preferences to find them a suitable restaurant: diff --git a/docs/docs/building-classic-assistants/chitchat-faqs.mdx b/docs/docs/building-classic-assistants/chitchat-faqs.mdx index 5bfcf4542d89..ea32d5e1d8d9 100644 --- a/docs/docs/building-classic-assistants/chitchat-faqs.mdx +++ b/docs/docs/building-classic-assistants/chitchat-faqs.mdx @@ -5,6 +5,10 @@ title: Chitchat and FAQs abstract: FAQ assistants are the simplest assistants to build and typically the first kind of assistant anyone builds. This page is a guide to the concepts and training data you need to handle non-contextual questions like FAQs and chitchat. 
--- +import RasaNLUBasedBanner from "@theme/RasaNLUBasedBanner"; + + + [FAQs](./glossary.mdx#faqs) and [chitchat](./glossary.mdx#chitchat) are two cases where the conversational assistant responds with a fixed set of messages, and the assistant should always answer the same way, no matter what has happened previously in the conversation. For example, in the following conversation, every question can be asked at any point in the conversation, diff --git a/docs/docs/building-classic-assistants/components.mdx b/docs/docs/building-classic-assistants/components.mdx index d04158e6a03d..550aebb76df0 100644 --- a/docs/docs/building-classic-assistants/components.mdx +++ b/docs/docs/building-classic-assistants/components.mdx @@ -8,6 +8,10 @@ abstract: pre-processing, and more. --- +import RasaNLUBasedBanner from "@theme/RasaNLUBasedBanner"; + + + ## Language Models The following components load pre-trained models that are needed if you want to use pre-trained diff --git a/docs/docs/building-classic-assistants/contextual-conversations.mdx b/docs/docs/building-classic-assistants/contextual-conversations.mdx index a91293fd2655..f2b94f1274ef 100644 --- a/docs/docs/building-classic-assistants/contextual-conversations.mdx +++ b/docs/docs/building-classic-assistants/contextual-conversations.mdx @@ -5,6 +5,10 @@ title: Contextual Conversations abstract: Taking context into account is often key to providing a good user experience. This page is a guide to creating contextual conversation patterns. --- +import RasaNLUBasedBanner from "@theme/RasaNLUBasedBanner"; + + + In a contextual conversation, something beyond the previous step in the conversation plays a role in what should happen next. For example, if a user asks "How many?", it's not clear from the message alone what the user is asking about. 
In the context of the assistant saying, diff --git a/docs/docs/building-classic-assistants/domain.mdx b/docs/docs/building-classic-assistants/domain.mdx index b5aad8fe3d27..95e399b28887 100644 --- a/docs/docs/building-classic-assistants/domain.mdx +++ b/docs/docs/building-classic-assistants/domain.mdx @@ -5,6 +5,10 @@ title: Domain abstract: The domain defines the universe in which your assistant operates. It specifies the intents, entities, slots, responses, forms, and actions your bot should know about. It also defines a configuration for conversation sessions. --- +import RasaNLUBasedBanner from "@theme/RasaNLUBasedBanner"; + + + Here is a full example of a domain, taken from the [concertbot](https://github.com/RasaHQ/rasa/tree/main/examples/concertbot) example: diff --git a/docs/docs/building-classic-assistants/fallback-handoff.mdx b/docs/docs/building-classic-assistants/fallback-handoff.mdx index ce76e5e5ffab..b2f3c190febb 100644 --- a/docs/docs/building-classic-assistants/fallback-handoff.mdx +++ b/docs/docs/building-classic-assistants/fallback-handoff.mdx @@ -6,6 +6,9 @@ abstract: This is a guide on how to handle various failures of your assistant. --- import useBaseUrl from "@docusaurus/useBaseUrl"; +import RasaNLUBasedBanner from "@theme/RasaNLUBasedBanner"; + + Even if you design your bot perfectly, users will inevitably say things to your assistant that you did not anticipate. 
In these cases, your assistant will fail, diff --git a/docs/docs/building-classic-assistants/forms.mdx b/docs/docs/building-classic-assistants/forms.mdx index f22bd0587b9c..d06c1c62888d 100644 --- a/docs/docs/building-classic-assistants/forms.mdx +++ b/docs/docs/building-classic-assistants/forms.mdx @@ -6,6 +6,10 @@ description: Follow a rule-based process of information gathering using forms in abstract: One of the most common conversation patterns is to collect a few pieces of information from a user in order to do something (book a restaurant, call an API, search a database, etc.). This is also called **slot filling**. --- +import RasaNLUBasedBanner from "@theme/RasaNLUBasedBanner"; + + + ## Usage To use forms with Rasa you need to make sure that the diff --git a/docs/docs/building-classic-assistants/glossary.mdx b/docs/docs/building-classic-assistants/glossary.mdx index a7a9df1af10f..d365c0251585 100644 --- a/docs/docs/building-classic-assistants/glossary.mdx +++ b/docs/docs/building-classic-assistants/glossary.mdx @@ -5,6 +5,10 @@ title: Rasa Glossary description: Glossary for all Rasa-related terms --- +import RasaNLUBasedBanner from "@theme/RasaNLUBasedBanner"; + + + ## [Action](../concepts/custom-actions.mdx) A single step that a bot takes in a conversation (e.g. calling an API or sending a response back to the user). @@ -40,7 +44,7 @@ A way to store bot responses externally instead of including them directly in th The process of using user messages and conversation data to influence the design of an assistant and train the model, combined with engineering best practices. There are 6 steps that make up CDD: Share, Review, Annotate, Fix, Track, and Test. -## [Conversation Tests](../production/testing-your-assistant.mdx) +## [Conversation Tests](./testing-your-assistant.mdx) Modified story format that includes the full text of the user message in addition to the intent label. 
Test conversations are saved to a test set file (conversation_tests.md), which is used to evaluate the model’s predictions across an entire conversation. diff --git a/docs/docs/building-classic-assistants/language-support.mdx b/docs/docs/building-classic-assistants/language-support.mdx index 35e2006b689f..49407b3b39cd 100644 --- a/docs/docs/building-classic-assistants/language-support.mdx +++ b/docs/docs/building-classic-assistants/language-support.mdx @@ -5,6 +5,10 @@ title: Language Support abstract: You can use Rasa to build assistants in any language you want. --- +import RasaNLUBasedBanner from "@theme/RasaNLUBasedBanner"; + + + Your Rasa assistant can be used on training data in **any language**. If there are no word embeddings for your language, you can train your featurizers from scratch with the data you provide. diff --git a/docs/docs/building-classic-assistants/model-configuration.mdx b/docs/docs/building-classic-assistants/model-configuration.mdx index 0f378b29ec4d..0e93da2a0b81 100644 --- a/docs/docs/building-classic-assistants/model-configuration.mdx +++ b/docs/docs/building-classic-assistants/model-configuration.mdx @@ -6,6 +6,10 @@ description: Learn about model configuration for Rasa. abstract: The configuration file defines the components and policies that your model will use to make predictions based on user input. --- +import RasaNLUBasedBanner from "@theme/RasaNLUBasedBanner"; + + + The recipe key allows for different types of config and model architecture. Currently, "default.v1" and the experimental "graph.v1" recipes are supported. 
diff --git a/docs/docs/building-classic-assistants/nlu-only.mdx b/docs/docs/building-classic-assistants/nlu-only.mdx index a42b1aa6f5dd..50544c0bc09f 100644 --- a/docs/docs/building-classic-assistants/nlu-only.mdx +++ b/docs/docs/building-classic-assistants/nlu-only.mdx @@ -5,6 +5,10 @@ title: Using NLU Only abstract: Find out how to use only Rasa NLU as a standalone NLU service for your chatbot or virtual assistant. --- +import RasaNLUBasedBanner from "@theme/RasaNLUBasedBanner"; + + + If you want to use Rasa only as an NLU component, you can! ## Training NLU-only models diff --git a/docs/docs/building-classic-assistants/nlu-training-data.mdx b/docs/docs/building-classic-assistants/nlu-training-data.mdx index 3de1afabadfd..b305405c7df7 100644 --- a/docs/docs/building-classic-assistants/nlu-training-data.mdx +++ b/docs/docs/building-classic-assistants/nlu-training-data.mdx @@ -6,6 +6,10 @@ description: Read more about how to format training data with Rasa NLU for open abstract: NLU training data stores structured information about user messages. --- +import RasaNLUBasedBanner from "@theme/RasaNLUBasedBanner"; + + + The goal of NLU (Natural Language Understanding) is to extract structured information from user messages. This usually includes the user's [intent](./glossary.mdx#intent) and any [entities](./glossary.mdx#entity) their message contains. You can add extra information such as [regular expressions](#regular-expressions) and [lookup tables](#lookup-tables) to your diff --git a/docs/docs/building-classic-assistants/policies.mdx b/docs/docs/building-classic-assistants/policies.mdx index a073424a7cc2..01b23cebc033 100644 --- a/docs/docs/building-classic-assistants/policies.mdx +++ b/docs/docs/building-classic-assistants/policies.mdx @@ -5,6 +5,10 @@ title: Policies abstract: NLU-based Rasa assistants use a variety of policies to decide which action to take at each step in a conversation. 
There are machine-learning and rule-based policies that your assistant can use in tandem. --- +import RasaNLUBasedBanner from "@theme/RasaNLUBasedBanner"; + + + When building an NLU-based assistant in Rasa, there are a number of dialogue policies available to you. You can customize the policies your assistant uses by specifying the `policies` key in your project's `config.yml`. diff --git a/docs/docs/building-classic-assistants/reaching-out-to-user.mdx b/docs/docs/building-classic-assistants/reaching-out-to-user.mdx index f3f9512ea5ad..c065826fd7f0 100644 --- a/docs/docs/building-classic-assistants/reaching-out-to-user.mdx +++ b/docs/docs/building-classic-assistants/reaching-out-to-user.mdx @@ -5,6 +5,10 @@ title: Reaching Out to the User abstract: Sometimes you want your assistant to reach out to the user without the user's prompting. For example, you might want the assistant to send a message when the user opens the chat window, or you might want to prompt the user if they haven't sent a message for a while. This page is a guide to enabling your assistant to reach out to the user proactively. --- +import RasaNLUBasedBanner from "@theme/RasaNLUBasedBanner"; + + + ## Reaching out first In most use cases, when the user opens the chat window with your assistant, you will want the diff --git a/docs/docs/building-classic-assistants/rules.mdx b/docs/docs/building-classic-assistants/rules.mdx index a854d91ad9a4..3669b60fb71c 100644 --- a/docs/docs/building-classic-assistants/rules.mdx +++ b/docs/docs/building-classic-assistants/rules.mdx @@ -8,6 +8,10 @@ abstract: Rules describe short pieces of conversations that should always follow the same path. --- +import RasaNLUBasedBanner from "@theme/RasaNLUBasedBanner"; + + + **Don't overuse rules**. Rules are great to handle small specific conversation patterns, but unlike [stories](./stories.mdx), rules don't have the power to generalize to unseen conversation paths. 
Combine rules and stories to make your assistant robust and able to handle diff --git a/docs/docs/building-classic-assistants/stories.mdx b/docs/docs/building-classic-assistants/stories.mdx index 97cc72f1c8a5..6aefbf15893d 100644 --- a/docs/docs/building-classic-assistants/stories.mdx +++ b/docs/docs/building-classic-assistants/stories.mdx @@ -8,6 +8,10 @@ abstract: model. Stories can be used to train models that are able to generalize to unseen conversation paths. --- +import RasaNLUBasedBanner from "@theme/RasaNLUBasedBanner"; + + + ## Format A story is a representation of a conversation between a user and an AI assistant, @@ -200,7 +204,7 @@ Overusing OR statements will slow down training. ## Test Conversation Format The test conversation format is a format that combines both NLU data and stories -into a single file for evaluation. Read more about this format in [Testing Your Assistant](../production/testing-your-assistant.mdx). +into a single file for evaluation. Read more about this format in [Testing Your Assistant](./testing-your-assistant.mdx). :::caution testing only This format is only used for testing and cannot be used for training. diff --git a/docs/docs/building-classic-assistants/testing-your-assistant.mdx b/docs/docs/building-classic-assistants/testing-your-assistant.mdx new file mode 100644 index 000000000000..29a9095617a5 --- /dev/null +++ b/docs/docs/building-classic-assistants/testing-your-assistant.mdx @@ -0,0 +1,556 @@ +--- +id: testing-your-assistant +sidebar_label: Testing Your Assistant +title: Testing Your Assistant +abstract: + Rasa lets you validate and test dialogues end-to-end by running through + test stories. In addition, you can + also test the dialogue management and the message processing (NLU) + separately. 
+---
+
+import useBaseUrl from "@docusaurus/useBaseUrl";
+
+import RasaProLabel from "@theme/RasaProLabel";
+import RasaProBanner from "@theme/RasaProBanner";
+import RasaNLUBasedBanner from "@theme/RasaNLUBasedBanner";
+
+<RasaNLUBasedBanner/>
+
+## Validating Data and Stories
+
+Data validation verifies that no mistakes or major inconsistencies appear in your domain, NLU
+data, or story data. To validate your data, have your CI run this command:
+
+```bash
+rasa data validate
+```
+
+If you pass a `max_history` value to one or more policies in your `config.yml` file, provide the
+smallest of those values as
+
+```bash
+rasa data validate --max-history <max_history>
+```
+
+If data validation results in errors, training a model can also fail or yield bad performance, so it's
+always good to run this check before training a model. By including the
+`--fail-on-warnings` flag, this step will also fail on warnings, which indicate more minor issues.
+
+:::note
+Running `rasa data validate` does **not** test if your [rules](../building-classic-assistants/rules.mdx) are consistent with your stories.
+However, during training, the `RulePolicy` checks for conflicts between rules and stories. Any such conflict will abort training.
+:::
+
+To read more about the validator and all of the available options, see [the documentation for
+`rasa data validate`](../command-line-interface.mdx#rasa-data-validate).
+
+## Writing Test Stories
+
+Testing your trained model on test stories is the best way to have confidence in how your assistant
+will act in certain situations. Written in a modified story
+format, test stories allow you to provide entire conversations and test that, given certain
+user input, your model will behave in the expected manner. This is especially
+important as you start introducing more complicated stories from user
+conversations.
+
+Test stories are like
+the stories in your training data, but include the user message as well.
+ +Here are some examples: + + + + +```yaml-rasa title="tests/test_stories.yml" {5,9,13} +stories: +- story: A basic story test + steps: + - user: | + hello + intent: greet + - action: utter_ask_howcanhelp + - user: | + show me [chinese]{"entity": "cuisine"} restaurants + intent: inform + - action: utter_ask_location + - user: | + in [Paris]{"entity": "location"} + intent: inform + - action: utter_ask_price +``` + + + + +```yaml-rasa title="tests/test_stories.yml" {8,11} +stories: +- story: A test where a user clicks on a button with payload + steps: + - user: | + hello + intent: greet + - action: utter_ask_howcanhelp + - user: /inform{{"cuisine":"chinese"}} + intent: inform + - action: utter_ask_location + - user: /inform{{"location":"Paris"}} + intent: inform + - action: utter_ask_price +``` + + + + +```yaml-rasa title="tests/test_stories.yml" {5,12} +stories: +- story: A test where a custom action returns events + steps: + - user: | + hey + intent: greet + - action: my_custom_action + - slot_was_set: + - my_slot: "value added by custom action" + - action: utter_ask_age + - user: | + thanks + intent: thankyou + - action: utter_no_worries +``` + + + + +```yaml-rasa title="tests/test_stories.yml" {5,9,14,20} +stories: +- story: A test story with a form + steps: + - user: | + hi + intent: greet + - action: utter_greet + - user: | + im looking for a restaurant + intent: request_restaurant + - action: restaurant_form + - active_loop: restaurant_form + - user: | + [afghan](cuisine) food + intent: inform + - action: restaurant_form + - active_loop: null + - action: utter_slots_values + - user: | + thanks + intent: thankyou + - action: utter_no_worries +``` + + + + +```yaml-rasa title="tests/test_stories.yml" {5,9,14,21} +stories: +- story: A test story with unexpected input during a form + steps: + - user: | + hi + intent: greet + - action: utter_greet + - user: | + im looking for a restaurant + intent: request_restaurant + - action: restaurant_form + - active_loop: 
restaurant_form + - user: | + How's the weather? + intent: chitchat + - action: utter_chitchat + - action: restaurant_form + - active_loop: null + - action: utter_slots_values + - user: | + thanks + intent: thankyou + - action: utter_no_worries +``` + + + + +```yaml-rasa title="tests/test_stories.yml" {5,9,13} +stories: +- story: A basic test story with multiple entities for a single token + steps: + - user: | + hello + intent: greet + - action: utter_ask_howcanhelp + - user: | + show me [chinese]{"entity": "cuisine"} restaurants + intent: inform + - action: utter_ask_location + - user: | + in [Paris][{"entity": "location"}, {"entity": "city"}] + intent: inform + - action: utter_ask_price +``` + + + + +By default, the command will run tests on stories from any files with names starting with `test_`. You can also provide +a specific test stories file or directory with the `--stories` argument. +You can test your assistant against them by running: + +```bash +rasa test +``` + +Conversation testing is only as thorough and accurate as the test +cases you include, so you should continue to grow your set of test stories +as you make improvements to your assistant. A good rule of thumb to follow is that you should aim for your test stories +to be representative of the true distribution of real conversations. + +See the [CLI documentation on `rasa test`](../command-line-interface.mdx#rasa-test) for +more configuration options. + +:::caution Testing Custom Actions +[Custom Actions](../concepts/custom-actions.mdx) are not executed as part of test stories. If your custom +actions append any events to the conversation, this has to be reflected in your test story +(e.g. by adding `slot_was_set` events to your test story). + +To test the code of your custom actions, you should write unit tests +for them and include these tests in your [CI/CD pipeline](../production/setting-up-ci-cd.mdx). 
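To make this concrete, here is a minimal sketch of such a unit test, assuming the action's lookup logic is factored into a plain function so it can be tested without a tracker. The function, names, and data below are hypothetical illustrations, not part of the Rasa API:

```python
# Hypothetical example: the lookup a custom action might delegate to,
# kept as a plain function so it is easy to unit-test.
def pick_restaurant(cuisine: str, location: str) -> str:
    menu = {("chinese", "paris"): "Le Lotus"}  # made-up data
    return menu.get((cuisine.lower(), location.lower()), "no match")


def test_known_cuisine():
    assert pick_restaurant("Chinese", "Paris") == "Le Lotus"


def test_unknown_cuisine():
    assert pick_restaurant("afghan", "Berlin") == "no match"


if __name__ == "__main__":
    # With pytest, these functions would be collected automatically.
    test_known_cuisine()
    test_unknown_cuisine()
    print("action logic tests passed")
```

Running these tests in CI on every change catches regressions in the action's logic without needing a trained model.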
+ +::: + +## Evaluating an NLU Model + +In addition to testing stories, you can also test the natural language understanding (NLU) model separately. +Once your assistant is deployed in the real world, it will be processing messages that it hasn't seen +in the training data. To simulate this, you should always set aside some part of your data for testing. +You can either: + +1. [use a held out test set](#using-a-held-out-test-set) by shuffling and splitting your NLU data + +2. [use cross-validation](#using-cross-validation), which automatically creates + multiple train/test splits + +### Using a Held-Out Test Set + +If you use the train-test +set approach, it is best to [shuffle and split your data](../command-line-interface.mdx#rasa-data-split) +using `rasa data split` every time you evaluate your model, as +opposed to using a static NLU test set, which can easily become outdated. + +You can split your NLU data into train and test sets using: + +```bash +rasa data split nlu +``` + +Next, you can see how well your trained NLU model predicts the +data from the test set you generated, using: + +```bash {2} +rasa test nlu + --nlu train_test_split/test_data.yml +``` + +### Using Cross-Validation + +If you've made significant changes to your NLU training data (e.g. +splitting an intent into two intents or adding a lot of training examples), you should run a +full NLU evaluation using cross-validation. Cross-validation automatically creates +multiple train/test splits and averages the results of evaluations on each train/test split. +This means all your data is evaluated during cross-validation, making cross-validation the most +thorough way to automatically test your NLU model. 
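The fold mechanics behind cross-validation can be sketched in a few lines of plain Python — a simplified illustration of the idea, not Rasa's actual implementation:

```python
# Simplified sketch of k-fold cross-validation: each example is held out
# exactly once, and the per-fold scores are averaged at the end.
def k_fold_splits(n_examples: int, folds: int):
    """Yield (train_indices, test_indices) pairs, one per fold."""
    fold_size = n_examples // folds
    for i in range(folds):
        test = set(range(i * fold_size, (i + 1) * fold_size))
        train = [j for j in range(n_examples) if j not in test]
        yield train, sorted(test)


def cross_validate(n_examples: int, folds: int, score_fn) -> float:
    scores = [score_fn(train, test)
              for train, test in k_fold_splits(n_examples, folds)]
    return sum(scores) / len(scores)  # averaged, as the evaluation reports


# Toy scorer: pretend accuracy depends only on training-set size.
avg = cross_validate(100, 5, lambda train, test: len(train) / 100)
print(avg)  # ≈ 0.8 for this toy scorer
```

With 5 folds over 100 examples, every example lands in exactly one test split, which is why cross-validation evaluates all of your data.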
+ +To run NLU testing in cross-validation mode run: + +```bash {3} +rasa test nlu + --nlu data/nlu + --cross-validation +``` + +You can specify the number of test/train splits used with the `-f/--folds` flag: + +```bash {4} +rasa test nlu + --nlu data/nlu + --cross-validation + --folds 5 +``` + +Note that during cross-validation, the NLU model will be trained for each fold, +so cross-validation with a large data set and a high number of folds can be time-consuming. +On a small data set, a high number of folds can result in too few examples per intent being available for each test split. + +On the other hand, if you specify a low number of folds, your data will be split into much larger chunks, +and there will be proportionally less data to train on for each fold. + +Choose a number of folds that balances both considerations for your dataset size. + +:::tip hyperparameter tuning +To further improve your model check out this +[tutorial on hyperparameter tuning](https://blog.rasa.com/rasa-nlu-in-depth-part-3-hyperparameters/). +::: + +### Comparing NLU Pipelines + +To get the most out of your training data, you should train and evaluate your model on different pipelines +and different amounts of training data. + +To do so, pass multiple configuration files to the `rasa test` command: + +```bash {2} +rasa test nlu --nlu data/nlu.yml + --config config_1.yml config_2.yml +``` + +This performs several steps: + +1. Create a global 80% train / 20% test split from `data/nlu.yml`. +2. Exclude a certain percentage of data from the global train split. +3. Train models for each configuration on remaining training data. +4. Evaluate each model on the global test split. + +The above process is repeated with different percentages of training data in step 2 +to give you an idea of how each pipeline will behave if you increase the amount of training data. +Since training is not completely deterministic, the whole process is repeated +three times for each configuration specified. 
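The steps above amount to a nested loop over runs, exclusion percentages, and configs. Roughly, and with stub functions standing in for Rasa's real training and evaluation (the percentages and scoring here are illustrative only):

```python
def global_split(examples, test_fraction=0.2):
    """Step 1: a global train/test split (real runs shuffle first)."""
    cut = int(len(examples) * (1 - test_fraction))
    return examples[:cut], examples[cut:]


def compare_pipelines(examples, configs, runs=3, percentages=(0, 25, 50, 75)):
    results = {cfg: {pct: [] for pct in percentages} for cfg in configs}
    for _ in range(runs):  # repeated because training is not fully deterministic
        train, test = global_split(examples)
        for pct in percentages:  # step 2: exclude part of the train split
            kept = train[: int(len(train) * (1 - pct / 100))]
            for cfg in configs:  # step 3: train on the remaining data
                model = train_model(cfg, kept)
                results[cfg][pct].append(evaluate(model, test))  # step 4
    return results


# Stubs so the sketch runs; in Rasa these involve actual model training.
def train_model(cfg, data):
    return {"config": cfg, "n_train": len(data)}


def evaluate(model, test):
    return model["n_train"] / (model["n_train"] + len(test))


scores = compare_pipelines(list(range(100)), ["config_1.yml", "config_2.yml"])
```

Each `results[config][percentage]` list then holds one score per run, from which the mean and standard deviation are plotted.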
+ +A graph with the mean and standard deviations of +[f1-scores](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html) +across all runs is plotted. +The f1-score graph, along with all train/test sets, the trained models, classification and error reports, +will be saved into a folder called `nlu_comparison_results`. + +Inspecting the f1-score graph can help you understand if you have enough data for your NLU model. +If the graph shows that f1-score is still improving when all of the training data is used, +it may improve further with more data. But if f1-score has plateaued when all training data is used, +adding more data may not help. + +If you want to change the number of runs or exclusion percentages, you can: + +```bash {3} +rasa test nlu --nlu data/nlu.yml + --config config_1.yml config_2.yml + --runs 4 --percentages 0 25 50 70 90 +``` + +### Interpreting the Output + +#### Intent Classifiers + +The `rasa test` script will produce a report (`intent_report.json`), confusion matrix (`intent_confusion_matrix.png`) +and confidence histogram (`intent_histogram.png`) for your intent classification model. + +The report logs [precision](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_score.html), +[recall](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.recall_score.html) and +[f1-score](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html) for each intent, +as well as providing an overall average. You can save these reports as JSON files using the `--report` argument. + +The confusion matrix shows which intents are mistaken for others. +Any samples which have been incorrectly predicted are logged and saved to a file called `errors.json` for easier debugging. + +
+
+[image: intent confusion matrix]
+
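The per-intent entries in `intent_report.json` follow scikit-learn's classification-report structure (a dict per intent plus aggregate keys), so a short script can flag weak intents. A sketch, with a made-up report inlined in place of reading the file:

```python
import json

# Stand-in for: report = json.load(open("results/intent_report.json"))
# The intent names and scores below are invented for illustration.
report = json.loads("""
{
  "greet":  {"precision": 0.97, "recall": 0.95, "f1-score": 0.96, "support": 40},
  "inform": {"precision": 0.81, "recall": 0.70, "f1-score": 0.75, "support": 55},
  "weighted avg": {"precision": 0.88, "recall": 0.81, "f1-score": 0.84, "support": 95}
}
""")

# Skip the aggregate rows scikit-learn adds alongside the per-intent ones.
AGGREGATES = {"accuracy", "macro avg", "micro avg", "weighted avg"}
weak = sorted(
    name for name, scores in report.items()
    if name not in AGGREGATES and scores["f1-score"] < 0.8
)
print(weak)  # → ['inform']
```

Intents flagged this way are good candidates for more (or cleaner) training examples.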
+
+The histogram allows you to visualize the confidence of all predictions,
+with correct and incorrect predictions displayed as blue and red bars respectively.
+Improving the quality of your training data will move the blue histogram bars up the plot and the
+red histogram bars down the plot. It should also reduce the number of red bars overall.
+
+
+[image: intent prediction confidence histogram]
+
+
+#### Response Selectors
+
+`rasa test` evaluates response selectors in the same way that it evaluates intent classifiers, producing a
+report (`response_selection_report.json`), confusion matrix (`response_selection_confusion_matrix.png`),
+confidence histogram (`response_selection_histogram.png`) and errors (`response_selection_errors.json`).
+If your pipeline includes multiple response selectors, they are evaluated in a single report.
+
+The report logs precision, recall and f1-score for
+each sub-intent of a [retrieval intent](../building-classic-assistants/glossary.mdx#retrieval-intent) and provides an overall average.
+You can save these reports as JSON files using the `--report` argument.
+
+#### Entity Extraction
+
+`rasa test` reports recall, precision, and f1-score for each entity type that
+your trainable entity extractors are trained to recognize.
+
+Only trainable entity extractors, such as the `DIETClassifier` and `CRFEntityExtractor`, are
+evaluated by `rasa test`. Pretrained extractors like the `DucklingHTTPExtractor` are not evaluated.
+
+If you have multiple entity extractors in your pipeline, or use some custom extractors,
+multiple entities might be associated with the same token. In this case,
+you can use a list notation in the test files, such as:
+
+```yaml
+stories:
+- story: A basic test story with multiple entities for a single token
+  steps:
+  - user: |
+      I like [ice cream][{"entity": "food"}, {"entity": "dessert"}]
+    intent: inform
+  # ...
+```
+
+:::caution incorrect entity annotations
+If any of your entities are incorrectly annotated, your evaluation may fail. One common problem
+is that an entity cannot stop or start inside a token.
+For example, if you have an example for a `name` entity
+like `[Brian](name)'s house`, this is only valid if your tokenizer splits `Brian's` into
+multiple tokens.
+
+:::
+
+#### Entity Scoring
+
+To evaluate entity extraction we apply a simple tag-based approach.
We don't consider
+[BILOU tags](../building-classic-assistants/nlu-training-data.mdx#bilou-entity-tagging) exactly, but only the
+entity type tags on a per-token basis. For a location entity like “near Alexanderplatz” we
+expect the labels `LOC LOC` instead of the BILOU-based `B-LOC L-LOC`.
+
+Our approach is more lenient when it comes to evaluation, as it rewards
+partial extraction and does not penalize the splitting of entities.
+For example, given the aforementioned entity “near Alexanderplatz” and a system that extracts
+“Alexanderplatz”, our approach rewards the extraction of “Alexanderplatz” and penalizes the missed word “near”.
+
+The BILOU-based approach, however, would label this as a complete failure since it expects Alexanderplatz
+to be labeled as the last token of an entity (`L-LOC`) instead of a single-token entity (`U-LOC`). Note also that
+a split extraction of “near” and “Alexanderplatz” would get full scores under our approach and zero under the
+BILOU-based one.
+
+Here's a comparison between the two scoring mechanisms for the phrase “near Alexanderplatz tonight”:
+
+| extracted                                           | Simple tags (score) | BILOU tags (score)     |
+| --------------------------------------------------- | ------------------- | ---------------------- |
+| `[near Alexanderplatz](loc) [tonight](time)`        | loc loc time (3)    | B-loc L-loc U-time (3) |
+| `[near](loc) [Alexanderplatz](loc) [tonight](time)` | loc loc time (3)    | U-loc U-loc U-time (1) |
+| `near [Alexanderplatz](loc) [tonight](time)`        | O loc time (2)      | O U-loc U-time (1)     |
+| `[near](loc) Alexanderplatz [tonight](time)`        | loc O time (2)      | U-loc O U-time (1)     |
+| `[near Alexanderplatz tonight](loc)`                | loc loc loc (2)     | B-loc I-loc L-loc (1)  |
+
+## Evaluating a Dialogue Model
+
+You can evaluate your trained dialogue model on a set of test stories
+by using the test script:
+
+```bash
+rasa test core --stories test_stories.yml --out results
+```
+
+This will write any failed stories to `results/failed_test_stories.yml`.
+A story fails if at least one of the actions was predicted incorrectly.
+
+The test script will also save a confusion matrix to a file called
+`results/story_confmat.pdf`. For each action in your domain, the confusion
+matrix shows how often the action was correctly predicted and how often an
+incorrect action was predicted instead.
+
+### Interpreting the generated warnings
+
+The test script will also generate a warnings file called `results/stories_with_warnings.yml`.
+This file contains all test stories for which [`action_unlikely_intent`](../concepts/default-actions.mdx#action_unlikely_intent)
+was predicted at any conversation turn but all actions from the original story were predicted correctly.
+However, if a test story originally included an `action_unlikely_intent`, for example to ensure [a rule is designed to
+trigger the conversation path after an `action_unlikely_intent`](../concepts/default-actions.mdx#customization-1) but the ensemble of
+policies failed to do so, then the corresponding story will end up in `results/failed_test_stories.yml` as
+a failed story.
+
+The stories are sorted by the severity of the `action_unlikely_intent` prediction.
+This severity is calculated by [`UnexpecTEDIntentPolicy`](../building-classic-assistants/policies.mdx#unexpected-intent-policy) itself at prediction time.
+The higher the severity, the more unlikely the intent, and the more important it is to review that
+particular conversation path.
+
+Note that `action_unlikely_intent` is predicted by
+`UnexpecTEDIntentPolicy`, which employs a machine-learning model
+under the hood and can therefore produce false warnings as well. You can choose to ignore such warnings
+if the conversation paths in these stories are already present in the training stories.
+
+### Comparing Policy Configurations
+
+To choose a configuration for your dialogue model, or to choose hyperparameters for a
+specific policy, you want to measure how well your dialogue model will generalize
+to conversations it hasn't seen before. Especially in the beginning
+of a project, when you don't have many real conversations to train
+your bot on, you may not want to set some aside as a test set.
+
+Rasa has some scripts to help you choose and fine-tune your policy configuration.
+Once you are happy with it, you can then train your final configuration on your
+full data set.
+
+To do this, you first have to train models for your different
+configurations. Create two (or more) config files including the policies you want to
+compare, and then provide them to the train script to train your models:
+
+```bash
+rasa train core -c config_1.yml config_2.yml \
+ --out comparison_models --runs 3 --percentages 0 5 25 50 70 95
+```
+
+Similar to how the [NLU model was evaluated](#comparing-nlu-pipelines), the above
+command trains the dialogue model on multiple configurations and different amounts of training data.
+For each config file provided, Rasa will train dialogue models
+with 0, 5, 25, 50, 70 and 95% of your training stories excluded from the training
+data. This is repeated three times to ensure consistent results.
+
+Once this script has finished, you can pass multiple models to the test script
+to compare the models you just trained:
+
+```bash
+rasa test core -m comparison_models --stories stories_folder \
+ --out comparison_results --evaluate-model-directory
+```
+
+This will evaluate each model on the stories in `stories_folder`
+(these can be either training or test stories) and plot some graphs
+to show you which policy performs best. Since the previous train command
+excluded some amount of training data to train each model,
+the above test command can measure how well your model predicts the held-out stories.
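To make the `--percentages` semantics concrete, here is a rough worked example of how many stories each model gets to train on per run (the total of 200 stories is an assumption, and Rasa's exact rounding may differ):

```python
# Assumed training set size -- substitute your own story count.
TOTAL_STORIES = 200

# Exclusion percentages passed as --percentages 0 5 25 50 70 95.
percentages = [0, 5, 25, 50, 70, 95]

# Approximate number of stories left for training at each percentage.
trained_on = {p: TOTAL_STORIES - TOTAL_STORIES * p // 100 for p in percentages}
print(trained_on)  # {0: 200, 5: 190, 25: 150, 50: 100, 70: 60, 95: 10}
```

The 0% run trains on all stories and serves as the upper baseline the other runs are compared against.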
+To compare single policies, create config files containing only one policy each. + +:::note +This training process can take a long time, so we'd suggest letting it run +somewhere in the background where it can't be interrupted. + +::: + +### Testing Action Code + +The approach used to test your action code will depend on how it is +implemented. For example, if you connect to external APIs, you should write integration tests to ensure +that those APIs respond as expected to common inputs. However you test your action code, you should +include these tests in your CI pipeline so that they run each time you make changes. + +If you have any questions or problems, please share them with us in the dedicated +[testing section on our forum](https://forum.rasa.com/tags/testing)! + +## End-To-End Testing + + + + + +:::info New in 3.5 + +You can now use [end-to-end testing](../production/testing-your-assistant.mdx#end-to-end-testing) +to test your assistant as a whole, including dialogue management and custom actions. + +::: diff --git a/docs/docs/building-classic-assistants/training-data-format.mdx b/docs/docs/building-classic-assistants/training-data-format.mdx index 28546971dbbf..0183127aa49c 100644 --- a/docs/docs/building-classic-assistants/training-data-format.mdx +++ b/docs/docs/building-classic-assistants/training-data-format.mdx @@ -7,6 +7,10 @@ abstract: This page describes the different types of training data that go into a Rasa assistant and how this training data is structured. --- +import RasaNLUBasedBanner from "@theme/RasaNLUBasedBanner"; + + + ## Overview Rasa uses [YAML](https://yaml.org/spec/1.2/spec.html) as @@ -706,7 +710,7 @@ rasa test ``` If you want to know more about testing head over to -[Testing Your Assistant](../production/testing-your-assistant.mdx). +[Testing Your Assistant](./testing-your-assistant.mdx). 
## End-to-end Training diff --git a/docs/docs/building-classic-assistants/tuning-your-model.mdx b/docs/docs/building-classic-assistants/tuning-your-model.mdx index 0e5eb2aabd64..876783e6661d 100644 --- a/docs/docs/building-classic-assistants/tuning-your-model.mdx +++ b/docs/docs/building-classic-assistants/tuning-your-model.mdx @@ -6,6 +6,9 @@ abstract: Rasa will provide you with a suggested NLU config on initialization of --- import useBaseUrl from "@docusaurus/useBaseUrl"; +import RasaNLUBasedBanner from "@theme/RasaNLUBasedBanner"; + + ## How to Choose a Pipeline @@ -246,7 +249,7 @@ When using a multi-intent, the intent is featurized for machine learning policie ### Comparing Pipelines Rasa gives you the tools to compare the performance of multiple pipelines on your data directly. -See [Comparing NLU Pipelines](../production/testing-your-assistant.mdx#comparing-nlu-pipelines) for more information. +See [Comparing NLU Pipelines](./testing-your-assistant.mdx#comparing-nlu-pipelines) for more information. ## Choosing the Right Components diff --git a/docs/docs/building-classic-assistants/unexpected-input.mdx b/docs/docs/building-classic-assistants/unexpected-input.mdx index 3e82210f9ef2..9bd1cb0f1afd 100644 --- a/docs/docs/building-classic-assistants/unexpected-input.mdx +++ b/docs/docs/building-classic-assistants/unexpected-input.mdx @@ -7,6 +7,10 @@ abstract: will say unexpected things. This page is a guide on handling unexpected input. --- +import RasaNLUBasedBanner from "@theme/RasaNLUBasedBanner"; + + + Unexpected input is a deviation from the [happy path](./glossary.mdx#happy--unhappy-paths) that you have defined. 
For example: diff --git a/docs/docs/command-line-interface.mdx b/docs/docs/command-line-interface.mdx index 0135f4f59d48..e7b7d6e51d98 100644 --- a/docs/docs/command-line-interface.mdx +++ b/docs/docs/command-line-interface.mdx @@ -398,8 +398,8 @@ rasa test nlu ``` You can find more details on specific arguments for each testing type in -[Evaluating an NLU Model](./production/testing-your-assistant.mdx#evaluating-an-nlu-model) and -[Evaluating a Dialogue Management Model](./production/testing-your-assistant.mdx#evaluating-a-dialogue-model). +[Evaluating an NLU Model](./building-classic-assistants/testing-your-assistant.mdx#evaluating-an-nlu-model) and +[Evaluating a Dialogue Management Model](./building-classic-assistants/testing-your-assistant.mdx#evaluating-a-dialogue-model). The following arguments are available for `rasa test`: diff --git a/docs/docs/production/setting-up-ci-cd.mdx b/docs/docs/production/setting-up-ci-cd.mdx index 82856ba5ab5f..dc603796e70b 100644 --- a/docs/docs/production/setting-up-ci-cd.mdx +++ b/docs/docs/production/setting-up-ci-cd.mdx @@ -109,7 +109,7 @@ succeeded. ### Deploying Your Rasa Model -If you ran [test stories](../production/testing-your-assistant.mdx) in your CI pipeline, +If you ran [test stories](../building-classic-assistants/testing-your-assistant.mdx) in your CI pipeline, you'll already have a trained model. You can set up your CD pipeline to upload the trained model to your Rasa server if the CI results are satisfactory. 
For example, to upload a model to Rasa X/Enterprise: diff --git a/docs/docs/production/testing-your-assistant.mdx b/docs/docs/production/testing-your-assistant.mdx index 70d052472fe4..aa968273de7d 100644 --- a/docs/docs/production/testing-your-assistant.mdx +++ b/docs/docs/production/testing-your-assistant.mdx @@ -3,10 +3,7 @@ id: testing-your-assistant sidebar_label: Testing Your Assistant title: Testing Your Assistant abstract: - Rasa lets you validate and test dialogues end-to-end by running through - test stories. In addition, you can - also test the dialogue management and the message processing (NLU) - separately. + Rasa lets you validate and test dialogues end-to-end. --- import useBaseUrl from "@docusaurus/useBaseUrl"; @@ -15,7 +12,7 @@ import RasaProLabel from "@theme/RasaProLabel"; import RasaProBanner from "@theme/RasaProBanner"; -## Validating Data and Stories +## Validating Data Data validation verifies that no mistakes or major inconsistencies appear in your domain, NLU data, or story data. To validate your data, have your CI run this command: @@ -35,511 +32,9 @@ If data validation results in errors, training a model can also fail or yield ba always good to run this check before training a model. By including the `--fail-on-warnings` flag, this step will fail on warnings indicating more minor issues. -:::note -Running `rasa data validate` does **not** test if your [rules](../building-classic-assistants/rules.mdx) are consistent with your stories. -However, during training, the `RulePolicy` checks for conflicts between rules and stories. Any such conflict will abort training. -::: - To read more about the validator and all of the available options, see [the documentation for `rasa data validate`](../command-line-interface.mdx#rasa-data-validate). -## Writing Test Stories - -Testing your trained model on test stories is the best way to have confidence in how your assistant -will act in certain situations. 
Written in a modified story -format, test stories allow you to provide entire conversations and test that, given certain -user input, your model will behave in the expected manner. This is especially -important as you start introducing more complicated stories from user -conversations. - -Test stories are like -the stories in your training data, but include the user message as well. - -Here are some examples: - - - - -```yaml-rasa title="tests/test_stories.yml" {5,9,13} -stories: -- story: A basic story test - steps: - - user: | - hello - intent: greet - - action: utter_ask_howcanhelp - - user: | - show me [chinese]{"entity": "cuisine"} restaurants - intent: inform - - action: utter_ask_location - - user: | - in [Paris]{"entity": "location"} - intent: inform - - action: utter_ask_price -``` - - - - -```yaml-rasa title="tests/test_stories.yml" {8,11} -stories: -- story: A test where a user clicks on a button with payload - steps: - - user: | - hello - intent: greet - - action: utter_ask_howcanhelp - - user: /inform{{"cuisine":"chinese"}} - intent: inform - - action: utter_ask_location - - user: /inform{{"location":"Paris"}} - intent: inform - - action: utter_ask_price -``` - - - - -```yaml-rasa title="tests/test_stories.yml" {5,12} -stories: -- story: A test where a custom action returns events - steps: - - user: | - hey - intent: greet - - action: my_custom_action - - slot_was_set: - - my_slot: "value added by custom action" - - action: utter_ask_age - - user: | - thanks - intent: thankyou - - action: utter_no_worries -``` - - - - -```yaml-rasa title="tests/test_stories.yml" {5,9,14,20} -stories: -- story: A test story with a form - steps: - - user: | - hi - intent: greet - - action: utter_greet - - user: | - im looking for a restaurant - intent: request_restaurant - - action: restaurant_form - - active_loop: restaurant_form - - user: | - [afghan](cuisine) food - intent: inform - - action: restaurant_form - - active_loop: null - - action: utter_slots_values - - 
user: | - thanks - intent: thankyou - - action: utter_no_worries -``` - - - - -```yaml-rasa title="tests/test_stories.yml" {5,9,14,21} -stories: -- story: A test story with unexpected input during a form - steps: - - user: | - hi - intent: greet - - action: utter_greet - - user: | - im looking for a restaurant - intent: request_restaurant - - action: restaurant_form - - active_loop: restaurant_form - - user: | - How's the weather? - intent: chitchat - - action: utter_chitchat - - action: restaurant_form - - active_loop: null - - action: utter_slots_values - - user: | - thanks - intent: thankyou - - action: utter_no_worries -``` - - - - -```yaml-rasa title="tests/test_stories.yml" {5,9,13} -stories: -- story: A basic test story with multiple entities for a single token - steps: - - user: | - hello - intent: greet - - action: utter_ask_howcanhelp - - user: | - show me [chinese]{"entity": "cuisine"} restaurants - intent: inform - - action: utter_ask_location - - user: | - in [Paris][{"entity": "location"}, {"entity": "city"}] - intent: inform - - action: utter_ask_price -``` - - - - -By default, the command will run tests on stories from any files with names starting with `test_`. You can also provide -a specific test stories file or directory with the `--stories` argument. -You can test your assistant against them by running: - -```bash -rasa test -``` - -Conversation testing is only as thorough and accurate as the test -cases you include, so you should continue to grow your set of test stories -as you make improvements to your assistant. A good rule of thumb to follow is that you should aim for your test stories -to be representative of the true distribution of real conversations. - -See the [CLI documentation on `rasa test`](../command-line-interface.mdx#rasa-test) for -more configuration options. - -:::caution Testing Custom Actions -[Custom Actions](../concepts/custom-actions.mdx) are not executed as part of test stories. 
If your custom -actions append any events to the conversation, this has to be reflected in your test story -(e.g. by adding `slot_was_set` events to your test story). - -To test the code of your custom actions, you should write unit tests -for them and include these tests in your [CI/CD pipeline](./setting-up-ci-cd.mdx). - -::: - -## Evaluating an NLU Model - -In addition to testing stories, you can also test the natural language understanding (NLU) model separately. -Once your assistant is deployed in the real world, it will be processing messages that it hasn't seen -in the training data. To simulate this, you should always set aside some part of your data for testing. -You can either: - -1. [use a held out test set](#using-a-held-out-test-set) by shuffling and splitting your NLU data - -2. [use cross-validation](#using-cross-validation), which automatically creates - multiple train/test splits - -### Using a Held-Out Test Set - -If you use the train-test -set approach, it is best to [shuffle and split your data](../command-line-interface.mdx#rasa-data-split) -using `rasa data split` every time you evaluate your model, as -opposed to using a static NLU test set, which can easily become outdated. - -You can split your NLU data into train and test sets using: - -```bash -rasa data split nlu -``` - -Next, you can see how well your trained NLU model predicts the -data from the test set you generated, using: - -```bash {2} -rasa test nlu - --nlu train_test_split/test_data.yml -``` - -### Using Cross-Validation - -If you've made significant changes to your NLU training data (e.g. -splitting an intent into two intents or adding a lot of training examples), you should run a -full NLU evaluation using cross-validation. Cross-validation automatically creates -multiple train/test splits and averages the results of evaluations on each train/test split. 
-This means all your data is evaluated during cross-validation, making cross-validation the most -thorough way to automatically test your NLU model. - -To run NLU testing in cross-validation mode run: - -```bash {3} -rasa test nlu - --nlu data/nlu - --cross-validation -``` - -You can specify the number of test/train splits used with the `-f/--folds` flag: - -```bash {4} -rasa test nlu - --nlu data/nlu - --cross-validation - --folds 5 -``` - -Note that during cross-validation, the NLU model will be trained for each fold, -so cross-validation with a large data set and a high number of folds can be time-consuming. -On a small data set, a high number of folds can result in too few examples per intent being available for each test split. - -On the other hand, if you specify a low number of folds, your data will be split into much larger chunks, -and there will be proportionally less data to train on for each fold. - -Choose a number of folds that balances both considerations for your dataset size. - -:::tip hyperparameter tuning -To further improve your model check out this -[tutorial on hyperparameter tuning](https://blog.rasa.com/rasa-nlu-in-depth-part-3-hyperparameters/). -::: - -### Comparing NLU Pipelines - -To get the most out of your training data, you should train and evaluate your model on different pipelines -and different amounts of training data. - -To do so, pass multiple configuration files to the `rasa test` command: - -```bash {2} -rasa test nlu --nlu data/nlu.yml - --config config_1.yml config_2.yml -``` - -This performs several steps: - -1. Create a global 80% train / 20% test split from `data/nlu.yml`. -2. Exclude a certain percentage of data from the global train split. -3. Train models for each configuration on remaining training data. -4. Evaluate each model on the global test split. 
- -The above process is repeated with different percentages of training data in step 2 -to give you an idea of how each pipeline will behave if you increase the amount of training data. -Since training is not completely deterministic, the whole process is repeated -three times for each configuration specified. - -A graph with the mean and standard deviations of -[f1-scores](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html) -across all runs is plotted. -The f1-score graph, along with all train/test sets, the trained models, classification and error reports, -will be saved into a folder called `nlu_comparison_results`. - -Inspecting the f1-score graph can help you understand if you have enough data for your NLU model. -If the graph shows that f1-score is still improving when all of the training data is used, -it may improve further with more data. But if f1-score has plateaued when all training data is used, -adding more data may not help. - -If you want to change the number of runs or exclusion percentages, you can: - -```bash {3} -rasa test nlu --nlu data/nlu.yml - --config config_1.yml config_2.yml - --runs 4 --percentages 0 25 50 70 90 -``` - -### Interpreting the Output - -#### Intent Classifiers - -The `rasa test` script will produce a report (`intent_report.json`), confusion matrix (`intent_confusion_matrix.png`) -and confidence histogram (`intent_histogram.png`) for your intent classification model. - -The report logs [precision](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_score.html), -[recall](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.recall_score.html) and -[f1-score](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html) for each intent, -as well as providing an overall average. You can save these reports as JSON files using the `--report` argument. - -The confusion matrix shows which intents are mistaken for others. 
-Any samples which have been incorrectly predicted are logged and saved to a file called `errors.json` for easier debugging. - -
- image -
- -The histogram allows you to visualize the confidence for all predictions, -with the correct and incorrect predictions being displayed by blue and red bars respectively. -Improving the quality of your training data will move the blue histogram bars up the plot and the -red histogram bars down the plot. It should also help in reducing the number of red histogram bars itself. - -
- image -
- -#### Response Selectors - -`rasa test` evaluates response selectors in the same way that it evaluates intent classifiers, producing a -report (`response_selection_report.json`), confusion matrix (`response_selection_confusion_matrix.png`), -confidence histogram (`response_selection_histogram.png`) and errors (`response_selection_errors.json`). -If your pipeline includes multiple response selectors, they are evaluated in a single report. - -The report logs precision, recall and f1 measure for -each sub-intent of a [retrieval intent](../building-classic-assistants/glossary.mdx#retrieval-intent) and provides an overall average. -You can save these reports as JSON files using the `--report` argument. - -#### Entity Extraction - -`rasa test` reports recall, precision, and f1-score for each entity type that -your trainable entity extractors are trained to recognize. - -Only trainable entity extractors, such as the `DIETClassifier` and `CRFEntityExtractor` are -evaluated by `rasa test`. Pretrained extractors like the `DucklingHTTPExtractor` are not evaluated. - -If you have multiple entity extractors in your pipeline, or use some custom extractors, -multiple entities might be associated with the same token. In this case, -you can use a list notation in the test files, such as - -```yaml -stories: - - story: A basic test story with multiple entities for a single token - steps: - - user: | - I like [ice cream][{\"entity\": \"food\"}, {\"entity\": \"desert\"}] - intent: inform - # ... -``` - -:::caution incorrect entity annotations -If any of your entities are incorrectly annotated, your evaluation may fail. One common problem -is that an entity cannot stop or start inside a token. -For example, if you have an example for a `name` entity -like `[Brian](name)'s house`, this is only valid if your tokenizer splits `Brian's` into -multiple tokens. - -::: - -#### Entity Scoring - -To evaluate entity extraction we apply a simple tag-based approach. 
We don't consider -[BILOU tags](../building-classic-assistants/nlu-training-data.mdx#bilou-entity-tagging) exactly, but only the -entity type tags on a per token basis. For location entity like “near Alexanderplatz” we -expect the labels `LOC LOC` instead of the BILOU-based `B-LOC L-LOC`. - -Our approach is more lenient when it comes to evaluation, as it rewards -partial extraction and does not penalize the splitting of entities. -For example, given the aforementioned entity “near Alexanderplatz” and a system that extracts -“Alexanderplatz”, our approach rewards the extraction of “Alexanderplatz” and penalizes the missed out word “near”. - -The BILOU-based approach, however, would label this as a complete failure since it expects Alexanderplatz -to be labeled as a last token in an entity (`L-LOC`) instead of a single token entity (`U-LOC`). Note also that -a split extraction of “near” and “Alexanderplatz” would get full scores on our approach and zero on the -BILOU-based one. - -Here's a comparison between the two scoring mechanisms for the phrase “near Alexanderplatz tonight”: - -| extracted | Simple tags (score) | BILOU tags (score) | -| --------------------------------------------------- | ------------------- | ---------------------- | -| `[near Alexanderplatz](loc) [tonight](time)` | loc loc time (3) | B-loc L-loc U-time (3) | -| `[near](loc) [Alexanderplatz](loc) [tonight](time)` | loc loc time (3) | U-loc U-loc U-time (1) | -| `near [Alexanderplatz](loc) [tonight](time)` | O loc time (2) | O U-loc U-time (1) | -| `[near](loc) Alexanderplatz [tonight](time)` | loc O time (2) | U-loc O U-time (1) | -| `[near Alexanderplatz tonight](loc)` | loc loc loc (2) | B-loc I-loc L-loc (1) | - -## Evaluating a Dialogue Model - -You can evaluate your trained dialogue model on a set of test stories -by using the test script: - -```bash -rasa test core --stories test_stories.yml --out results -``` - -This will print any failed stories to `results/failed_test_stories.yml`. 
-A story fails if at least one of the actions was predicted incorrectly. - -The test script will also save a confusion matrix to a file called -`results/story_confmat.pdf`. For each action in your domain, the confusion -matrix shows how often the action was correctly predicted and how often an -incorrect action was predicted instead. - -### Interpreting the generated warnings - -The test script will also generate a warnings file called `results/stories_with_warnings.yml`. -This file contains all test stories for which [`action_unlikely_intent`](../concepts/default-actions.mdx#action_unlikely_intent) -was predicted at any conversation turn but all actions from the original story were predicted correctly. -However, if a test story originally included an `action_unlikely_intent`, for example to ensure [a rule is designed to -trigger the conversation path after an `action_unlikely_intent`](../concepts/default-actions.mdx#customization-1) but the ensemble of -policies failed to do so, then the corresponding story will end up in `results/failed_test_stories.yml` as -a failed story. - -The stories are sorted by the severity of `action_unlikely_intent`'s prediction. -This severity is calculated by [`UnexpecTEDIntentPolicy`](../building-classic-assistants/policies.mdx#unexpected-intent-policy) itself at prediction time. -The higher the severity, the more unlikely is the intent and hence reviewing that particular -conversation path becomes more critical. - -Note, that `action_unlikely_intent` is predicted by -`UnexpecTEDIntentPolicy` which employs a machine learning based model -under the hood and hence can result in false warnings as well. You can choose to ignore such warnings -if the conversation paths in these stories are already present in the training stories. 
- -### Comparing Policy Configurations - -To choose a configuration for your dialogue model, or to choose hyperparameters for a -specific policy, you want to measure how well your dialogue model will generalize -to conversations it hasn't seen before. Especially in the beginning -of a project, when you don't have a lot of real conversations to train -your bot on, you may not want to exclude some to use as a test set. - -Rasa has some scripts to help you choose and fine-tune your policy configuration. -Once you are happy with it, you can then train your final configuration on your -full data set. - -To do this, you first have to train models for your different -configurations. Create two (or more) config files including the policies you want to -compare, and then provide them to the train script to train your models: - -```bash -rasa train core -c config_1.yml config_2.yml \ - --out comparison_models --runs 3 --percentages 0 5 25 50 70 95 -``` - -Similar to how the [NLU model was evaluated](../production/testing-your-assistant.mdx#comparing-nlu-pipelines), the above -command trains the dialogue model on multiple configurations and different amounts of training data. -For each config file provided, Rasa will train dialogue models -with 0, 5, 25, 50, 70 and 95% of your training stories excluded from the training -data. This is repeated three times to ensure consistent results. - -Once this script has finished, you can pass multiple models to the test script -to compare the models you just trained: - -```bash -rasa test core -m comparison_models --stories stories_folder - --out comparison_results --evaluate-model-directory -``` - -This will evaluate each model on the stories in `stories_folder` -(can be either training or test set) and plot some graphs -to show you which policy performs best. Since the previous train command -excluded some amount of training data to train each model, -the above test command can measure how well your model predicts the held-out stories. 
-To compare single policies, create config files containing only one policy each. - -:::note -This training process can take a long time, so we'd suggest letting it run -somewhere in the background where it can't be interrupted. - -::: - -### Testing Action Code - -The approach used to test your action code will depend on how it is -implemented. For example, if you connect to external APIs, you should write integration tests to ensure -that those APIs respond as expected to common inputs. However you test your action code, you should -include these tests in your CI pipeline so that they run each time you make changes. - -If you have any questions or problems, please share them with us in the dedicated -[testing section on our forum](https://forum.rasa.com/tags/testing)! - ## End-To-End Testing diff --git a/docs/sidebars.js b/docs/sidebars.js index e7b1831130ea..1a3917fa5779 100644 --- a/docs/sidebars.js +++ b/docs/sidebars.js @@ -26,7 +26,7 @@ module.exports = { ], }, "tutorial", - "command-line-interface", + "command-line-interface", ], }, { @@ -36,7 +36,7 @@ module.exports = { items: [ // TODO: ENG-537 "concepts/flows", - "concepts/dialogue-understanding", + "concepts/dialogue-understanding", "concepts/domain", "concepts/unhappy-paths", { @@ -54,9 +54,9 @@ module.exports = { label: "Responses", items: [ "concepts/responses", - "concepts/contextual-response-rephraser", + "concepts/contextual-response-rephraser", ], - }, + }, { type: "category", label: "Components", @@ -105,7 +105,8 @@ module.exports = { "building-classic-assistants/training-data-format", "building-classic-assistants/nlu-training-data", "building-classic-assistants/tuning-your-model", - "building-classic-assistants/nlu-only", + "building-classic-assistants/nlu-only", + "building-classic-assistants/testing-your-assistant", ], }, ], @@ -246,7 +247,7 @@ module.exports = { ], }, "operating/tracing", - "operating/spaces", + "operating/spaces", ], }, { @@ -260,7 +261,7 @@ module.exports = { label: 
"Reference", collapsed: true, items: [ - "building-classic-assistants/glossary", + "building-classic-assistants/glossary", "telemetry/telemetry", "telemetry/reference", require("./docs/reference/sidebar.json"), @@ -283,5 +284,5 @@ module.exports = { }, ], }, - ] + ], }; diff --git a/docs/themes/theme-custom/theme/RasaDiscoveryBanner/index.jsx b/docs/themes/theme-custom/theme/RasaDiscoveryBanner/index.jsx index 4630c5cbe3fc..a97d5f6f74ad 100644 --- a/docs/themes/theme-custom/theme/RasaDiscoveryBanner/index.jsx +++ b/docs/themes/theme-custom/theme/RasaDiscoveryBanner/index.jsx @@ -1,7 +1,5 @@ import * as React from 'react'; -import clsx from 'clsx'; -import styles from './styles.module.css'; function RasaDiscoveryBanner({isLoading, ...props}) { return ( @@ -19,7 +17,7 @@ function RasaDiscoveryBanner({isLoading, ...props}) {

- Discovery features are highly unstable. We introduce + Discovery features are highly unstable. We introduce them for product exploration. These features will change and should only be used for testing purposes. diff --git a/docs/themes/theme-custom/theme/RasaDiscoveryBanner/styles.module.css b/docs/themes/theme-custom/theme/RasaDiscoveryBanner/styles.module.css deleted file mode 100644 index 4a6e8636517a..000000000000 --- a/docs/themes/theme-custom/theme/RasaDiscoveryBanner/styles.module.css +++ /dev/null @@ -1,15 +0,0 @@ -.label { - background-color: #F6D261; - border: 1px solid transparent; - border-radius: 8px; - padding: 2px 12px; - font-size: 15px !important; - font-weight: 600; - - display: inline-block; -} - -.label[disabled] { - background-color: var(--ifm-color-gray-500); - cursor: default; -} diff --git a/docs/themes/theme-custom/theme/RasaLabsBanner/styles.module.css b/docs/themes/theme-custom/theme/RasaLabsBanner/styles.module.css index eafbc6bd315e..fed55d974a30 100644 --- a/docs/themes/theme-custom/theme/RasaLabsBanner/styles.module.css +++ b/docs/themes/theme-custom/theme/RasaLabsBanner/styles.module.css @@ -1,19 +1,3 @@ -.label { - background-color: #F6D261; - border: 1px solid transparent; - border-radius: 8px; - padding: 2px 12px; - font-size: 15px !important; - font-weight: 600; - - display: inline-block; -} - -.label[disabled] { - background-color: var(--ifm-color-gray-500); - cursor: default; -} - .titleExtension { text-transform: none !important; } diff --git a/docs/themes/theme-custom/theme/RasaNLUBasedBanner/index.jsx b/docs/themes/theme-custom/theme/RasaNLUBasedBanner/index.jsx new file mode 100644 index 000000000000..a12defd5d87a --- /dev/null +++ b/docs/themes/theme-custom/theme/RasaNLUBasedBanner/index.jsx @@ -0,0 +1,30 @@ +import * as React from 'react'; +import useBaseUrl from "@docusaurus/useBaseUrl"; + +function RasaNLUBasedBanner() { + return ( + <> +

+
+
+ + + + + + NLU-based assistants +
+
+
+

+ This page refers to building NLU-based assistants. + If you are working with Conversational AI with Language Models (CALM), + this content may not apply to you. +

+
+
+ + ) +} + +export default RasaNLUBasedBanner; diff --git a/docs/themes/theme-custom/theme/RasaProBanner/index.jsx b/docs/themes/theme-custom/theme/RasaProBanner/index.jsx index ef975c61942f..ae87d08ac5e3 100644 --- a/docs/themes/theme-custom/theme/RasaProBanner/index.jsx +++ b/docs/themes/theme-custom/theme/RasaProBanner/index.jsx @@ -1,7 +1,4 @@ import * as React from 'react'; -import clsx from 'clsx'; - -import styles from './styles.module.css'; function RasaProBanner({isLoading, ...props}) { return ( diff --git a/docs/themes/theme-custom/theme/RasaProBanner/styles.module.css b/docs/themes/theme-custom/theme/RasaProBanner/styles.module.css deleted file mode 100644 index 55cfcd71e39b..000000000000 --- a/docs/themes/theme-custom/theme/RasaProBanner/styles.module.css +++ /dev/null @@ -1,18 +0,0 @@ - - -.label { - background-color:#F6D261; - border: 1px solid transparent; - border-radius: 8px; - padding: 2px 12px; - font-size: 15px !important; - font-weight: 600; - - display: inline-block; -} - -.label[disabled] { - background-color: var(--ifm-color-gray-500); - cursor: default; -} -