update docs

FlagOpen · Dec 5, 2024 · bc09ef5 · bc09ef5
1 parent 2bdd0f0
commit bc09ef5
Show file tree

Hide file tree

Showing 14 changed files with 193 additions and 53 deletions.
diff --git a/Tutorials/1_Embedding/1.2.1_BGE_Series.ipynb b/Tutorials/1_Embedding/1.2.1_BGE_Series.ipynb
@@ -55,7 +55,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 3,
    "id": "a2376217",
    "metadata": {},
    "outputs": [],
@@ -123,35 +123,16 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": null,
    "id": "89e07751",
    "metadata": {},
    "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "/root/anaconda3/envs/dev/lib/python3.12/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
-      "  from .autonotebook import tqdm as notebook_tqdm\n",
-      "pre tokenize: 100%|██████████| 1/1 [00:00<00:00, 93.88it/s]\n",
-      "/root/anaconda3/envs/dev/lib/python3.12/site-packages/_distutils_hack/__init__.py:54: UserWarning: Reliance on distutils from stdlib is deprecated. Users must rely on setuptools to provide the distutils module. Avoid importing distutils or import setuptools first, and avoid setting SETUPTOOLS_USE_DISTUTILS=stdlib. Register concerns at https://github.com/pypa/setuptools/issues/new?template=distutils-deprecation.yml\n",
-      "  warnings.warn(\n",
-      "pre tokenize: 100%|██████████| 1/1 [00:00<00:00, 2418.86it/s]"
-     ]
-    },
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "[[0.8486 0.7944]\n",
-      " [0.7607 0.8545]]\n"
-     ]
-    },
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "\n"
+      "[[0.84864    0.7946737 ]\n",
+      " [0.760097   0.85449743]]\n"
      ]
     }
    ],
@@ -162,7 +143,7 @@
     "model = FlagModel(\n",
     "    'BAAI/bge-base-en',\n",
     "    query_instruction_for_retrieval=\"Represent this sentence for searching relevant passages:\",\n",
-    "    query_instruction_format='{}{}'\n",
+    "    query_instruction_format='{}{}',\n",
     ")\n",
     "\n",
     "queries = [\"query 1\", \"query 2\"]\n",
@@ -562,7 +543,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": null,
    "id": "ffb586c6",
    "metadata": {},
    "outputs": [
@@ -588,8 +569,6 @@
     "from FlagEmbedding import FlagICLModel\n",
     "import os\n",
     "\n",
-    "# os.environ['HF_ENDPOINT']='https://hf-mirror.com'\n",
-    "\n",
     "model = FlagICLModel('BAAI/bge-en-icl', \n",
     "                     examples_for_task=examples,  # set `examples_for_task=None` to use model without examples\n",
     "                    #  examples_instruction_format=\"<instruct>{}\\n<query>{}\\n<response>{}\" # specify the format to use examples_for_task\n",

diff --git a/docs/source/FAQ/index.rst b/docs/source/FAQ/index.rst
@@ -1,2 +1,43 @@
 FAQ
-===
+===
+
+Below are some commonly asked questions.
+
+.. tip::
+
+    For more questions, search issues on GitHub or join our community!
+
+
+.. dropdown:: When does the query instruction need to be used?
+
+    For a retrieval task that uses short queries to find long related documents, it is recommended to add instructions for these short queries. 
+    The best method to decide whether to add instructions for queries is choosing the setting that achieves better performance on your task. 
+    In all cases, no instruction needs to be added to the documents/passages.
+
+.. dropdown:: Why does it take so long to encode just one sentence?
+
+    Note that if you have multiple CUDA GPUs, FlagEmbedding will automatically use all of them.
+    In that case, starting the worker processes takes far longer than the actual encoding.
+    For simple tasks, try using only the CPU or a single GPU.
+
+.. dropdown:: Why are the embedding results different on CPU and GPU?
+
+    The encode function will use FP16 by default if GPU is available, which leads to different precision. 
+    Set :code:`use_fp16=False` to get full precision.
+
+.. dropdown:: How many languages do the multi-lingual models support?
+
+    The training datasets cover 170+ languages.
+    But note that, due to the unbalanced distribution of languages, performance will differ across languages.
+    Please test further on your real application scenario.
+
+.. dropdown:: How do the different retrieval methods work in bge-m3?
+
+    - Dense retrieval: map the text into a single embedding, e.g., `DPR <https://arxiv.org/abs/2004.04906>`_, `BGE-v1.5 <../bge/bge_v1_v1.5>`_
+    - Sparse retrieval (lexical matching): a vector of size equal to the vocabulary, with the majority of positions set to zero, calculating a weight only for tokens present in the text,
+      e.g., BM25, `unicoil <https://arxiv.org/pdf/2106.14807>`_, and `splade <https://arxiv.org/abs/2107.05720>`_
+    - Multi-vector retrieval: use multiple vectors to represent a text, e.g., `ColBERT <https://arxiv.org/abs/2004.12832>`_.
+
+.. dropdown:: Recommended vector database?
+
+    Generally, you can use any vector database (open-source or commercial). We use `Faiss <https://github.com/facebookresearch/faiss>`_ by default in our evaluation pipeline and tutorials.
diff --git a/docs/source/Introduction/index.rst b/docs/source/Introduction/index.rst
@@ -7,13 +7,14 @@ BGE builds one-stop retrieval toolkit for search and RAG. We provide inference,
    :width: 700
    :align: center
 
-   BGE embedder and reranker in an RAG pipeline.
+   BGE embedder and reranker in an RAG pipeline. `Source <https://safjan.com/images/retrieval_augmented_generation/RAG.png>`_
 
 Quickly get started with:
 
 .. toctree::
    :maxdepth: 1
 
    installation
+   quick_start
    concept
-   quick_start
+   retrieval_demo
diff --git a/docs/source/Introduction/quick_start.rst b/docs/source/Introduction/quick_start.rst
@@ -0,0 +1,35 @@
+Quick Start
+===========
+
+First, load one of the BGE embedding models:
+
+.. code:: python
+
+    from FlagEmbedding import FlagAutoModel
+
+    model = FlagAutoModel.from_finetuned('BAAI/bge-base-en-v1.5',
+                                        query_instruction_for_retrieval="Represent this sentence for searching relevant passages:",
+                                        use_fp16=True)
+
+.. tip::
+
+    If there's difficulty connecting to Hugging Face, you can use the `HF mirror <https://hf-mirror.com/>`_ instead.
+
+    .. code:: bash
+
+        export HF_ENDPOINT=https://hf-mirror.com
+
+Then, feed some sentences to the model and get their embeddings:
+
+.. code:: python
+
+    sentences_1 = ["I love NLP", "I love machine learning"]
+    sentences_2 = ["I love BGE", "I love text retrieval"]
+    embeddings_1 = model.encode(sentences_1)
+    embeddings_2 = model.encode(sentences_2)
+
+Once we get the embeddings, we can compute similarity by inner product:
+
+.. code:: python
+
+    similarity = embeddings_1 @ embeddings_2.T
+    print(similarity)
diff --git a/docs/source/Introduction/quick_start.ipynb → .../source/Introduction/retrieval_demo.ipynb b/docs/source/Introduction/quick_start.ipynb → .../source/Introduction/retrieval_demo.ipynb
@@ -4,7 +4,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "# Quick Start"
+    "# Retrieval Demo"
    ]
   },
   {

diff --git a/docs/source/_static/img/BGE_WeChat_Group.png b/docs/source/_static/img/BGE_WeChat_Group.png
diff --git a/docs/source/bge/bge_icl.rst b/docs/source/bge/bge_icl.rst
@@ -1,2 +1,51 @@
-BGE-en-icl
-==========
+BGE-EN-ICL
+==========
+
+BGE-EN-ICL is the new SOTA embedding model in the BGE series, with the following capabilities:
+
+- In-context learning ability: By providing few-shot examples in the query, it can significantly enhance the model's ability to handle new tasks.
+- Outstanding performance: The model has achieved state-of-the-art (SOTA) performance on MTEB and AIR-Bench.
+
++-------------------------------------------------------------------+-----------------+------------+--------------+----------------------------------------------------------------------------------------------------+
+|                                  Model                            |    Language     | Parameters |  Model Size  |                                            Description                                             |
++===================================================================+=================+============+==============+====================================================================================================+
+| `BAAI/bge-en-icl <https://huggingface.co/BAAI/bge-en-icl>`_       |     English     |    7.1B    |    28.5 GB   | In-context learning capabilities, fully leverage the model's potential based on a few shot examples|
++-------------------------------------------------------------------+-----------------+------------+--------------+----------------------------------------------------------------------------------------------------+
+
+
+
+Usage
+-----
+
+.. code:: python
+
+    from FlagEmbedding import FlagICLModel
+
+    documents = [
+        "As a general guideline, the CDC's average requirement of protein for women ages 19 to 70 is 46 grams per day. But, as you can see from this chart, you'll need to increase that if you're expecting or training for a marathon. Check out the chart below to see how much protein you should be eating each day.",
+        "Definition of summit for English Language Learners. : 1  the highest point of a mountain : the top of a mountain. : 2  the highest level. : 3  a meeting or series of meetings between the leaders of two or more governments."
+    ]
+
+    examples = [
+        {
+            'instruct': 'Given a web search query, retrieve relevant passages that answer the query.',
+            'query': 'what is a virtual interface',
+            'response': "A virtual interface is a software-defined abstraction that mimics the behavior and characteristics of a physical network interface. It allows multiple logical network connections to share the same physical network interface, enabling efficient utilization of network resources. Virtual interfaces are commonly used in virtualization technologies such as virtual machines and containers to provide network connectivity without requiring dedicated hardware. They facilitate flexible network configurations and help in isolating network traffic for security and management purposes."
+        },
+        {
+            'instruct': 'Given a web search query, retrieve relevant passages that answer the query.',
+            'query': 'causes of back pain in female for a week',
+            'response': "Back pain in females lasting a week can stem from various factors. Common causes include muscle strain due to lifting heavy objects or improper posture, spinal issues like herniated discs or osteoporosis, menstrual cramps causing referred pain, urinary tract infections, or pelvic inflammatory disease. Pregnancy-related changes can also contribute. Stress and lack of physical activity may exacerbate symptoms. Proper diagnosis by a healthcare professional is crucial for effective treatment and management."
+        }
+    ]
+
+    queries = ["how much protein should a female eat", "summit define"]
+
+    model = FlagICLModel('BAAI/bge-en-icl', 
+                         examples_for_task=examples,  # set `examples_for_task=None` to use model without examples
+                         examples_instruction_format="<instruct>{}\n<query>{}\n<response>{}") # specify the format to use examples_for_task
+
+    embeddings_1 = model.encode_queries(queries)
+    embeddings_2 = model.encode_corpus(documents)
+    similarity = embeddings_1 @ embeddings_2.T
+
+    print(similarity)
diff --git a/docs/source/bge/bge_m3.rst b/docs/source/bge/bge_m3.rst
@@ -114,4 +114,10 @@ Usage
     sentences_1 = ["What is BGE M3?", "Defination of BM25"]
 
     output = model.encode(sentences_1, return_dense=True, return_sparse=True, return_colbert_vecs=True)
-    dense, sparse, multiv = output['dense_vecs'], output['lexical_weights'], output['colbert_vecs']
+    dense, sparse, multiv = output['dense_vecs'], output['lexical_weights'], output['colbert_vecs']
+
+Useful Links:
+
+`API <../API/inference/embedder/encoder_only/M3Embedder>`_
+`Tutorial <https://github.com/FlagOpen/FlagEmbedding/tree/master/Tutorials/1_Embedding>`_
+`Example <https://github.com/FlagOpen/FlagEmbedding/tree/master/examples/inference/embedder/encoder_only>`_
diff --git a/docs/source/bge/bge_reranker.rst b/docs/source/bge/bge_reranker.rst
@@ -1,2 +1,2 @@
 BGE-Reranker
-============
+============
diff --git a/docs/source/bge/bge_v1_v1.5.rst b/docs/source/bge/bge_v1_v1.5.rst
@@ -51,7 +51,7 @@ were released in Sep 2023. They are still the most popular embedding models that
 Usage
 -----
 
-To use BGE v1 or v1.5 model for inference, load model through ``
+To use a BGE v1 or v1.5 model for inference, load the model as follows:
 
 .. code:: python
 
@@ -64,14 +64,35 @@ To use BGE v1 or v1.5 model for inference, load model through ``
 
 .. tip::
 
-    For simple tasks that only encode a few sentences like above, it's faster to use single GPU comparing to multi-GPUs:
+    For simple tasks that only encode a few sentences, like the one above, it's faster to use the CPU or a single GPU instead of multiple GPUs.
 
-    .. code:: python
+To use CPU:
 
-        import os
-        os.environ['CUDA_VISIBLE_DEVICES'] = '0'
-    or 
+.. code:: python
+
+    # make no GPU visible
+    import os
+    os.environ['CUDA_VISIBLE_DEVICES'] = ''
+    
+    # or specify the device when initializing the model
+    model = FlagModel('BAAI/bge-base-en-v1.5', devices='cpu')
+
+To use a single GPU:
+
+.. code:: python
+
+    # make a single GPU visible
+    import os
+    os.environ['CUDA_VISIBLE_DEVICES'] = '0'
+    
+    # or specify the device when initializing the model
+    model = FlagModel('BAAI/bge-base-en-v1.5', devices=0)
+
+|
+
+Useful Links:
+
+`API <../API/inference/embedder/encoder_only/BaseEmbedder>`_
 
-    .. code:: python
+`Tutorial <https://github.com/FlagOpen/FlagEmbedding/blob/master/Tutorials/1_Embedding/1.2.3_BGE_v1%261.5.ipynb>`_
 
-        model = FlagModel('BAAI/bge-base-en-v1.5', devices=0)
+`Example <https://github.com/FlagOpen/FlagEmbedding/tree/master/examples/inference/embedder/encoder_only>`_
diff --git a/docs/source/bge/index.rst b/docs/source/bge/index.rst
@@ -11,9 +11,3 @@ BGE
    bge_m3
    bge_icl
 
-.. toctree::
-   :maxdepth: 1
-   :caption: Embedder
-
-   bge_reranker
-
diff --git a/docs/source/community/index.rst b/docs/source/community/index.rst
@@ -1,2 +1,12 @@
 Community
-=========
+=========
+
+Visit our `GitHub repo <https://github.com/FlagOpen/FlagEmbedding>`_ and 
+`Hugging Face collection <https://huggingface.co/collections/BAAI/bge-66797a74476eb1f085c7446d>`_ for more materials!
+
+We also run WeChat groups for BGE. Scan the QR code to join the group chat!
+To get first-hand news about our updates and new releases, or if you have any questions or ideas, join us now!
+
+.. figure:: ../_static/img/BGE_WeChat_Group.png
+   :width: 400
+   :align: center
diff --git a/docs/source/conf.py b/docs/source/conf.py
@@ -36,8 +36,8 @@
 
 # html_theme = 'furo'
 html_theme = "pydata_sphinx_theme"
-html_logo = "_static/img/BAAI_logo.png"
-html_title = "FlagEmbedding"
+# html_logo = "_static/img/BAAI_logo.png"
+html_title = "BGE"
 html_static_path = ['_static']
 html_css_files = ["css/custom.css"]
 html_theme_options = {
@@ -80,7 +80,7 @@
             "icon": "fa-solid fa-cube",
         }
     ],
-    "header_links_before_dropdown": 7,
+    "header_links_before_dropdown": 5,
 }
 
 html_context = {

diff --git a/docs/source/tutorial/index.rst b/docs/source/tutorial/index.rst
@@ -1,6 +1,10 @@
 Tutorials
 =========
 
+In this section, we provide hands-on introductions to different topics that are highly related to embedding models and retrieval.
+
+To run the tutorials, clone the GitHub repo and check the `Tutorials <https://github.com/FlagOpen/FlagEmbedding/tree/master/Tutorials>`_ folder.
+
 .. toctree::
    :hidden:
    :maxdepth: 1