From bee1801ccce4931ecaba4941c4f9c4b0670edf8b Mon Sep 17 00:00:00 2001 From: Dunstan Matekenya Date: Wed, 6 Nov 2024 05:13:47 -0500 Subject: [PATCH] Re-cloned to deal with API key issue --- README.md | 386 +--- docs/_toc.yml | 49 +- docs/course-requirements/data-science.md | 24 + docs/course-requirements/learning-python.md | 26 + docs/course-requirements/platforms.md | 90 + .../course-requirements/python-environment.md | 70 + docs/malawi-nov-24/README.md | 23 + docs/tunisia-may-24/README.md | 20 + docs/tunisia-may-24/module-1.md | 35 + docs/tunisia-may-24/module-2.md | 58 + docs/tunisia-may-24/module-3.md | 39 + docs/tunisia-may-24/module-4.md | 26 + docs/tunisia-may-24/project-ideas.md | 44 + .../streamlit-app-deployment.md | 55 + notebooks/malawi-nov-24/1-text2sqL-demo.ipynb | 647 ++++++ ...document-classification-with-sklearn.ipynb | 575 +++++ .../malawi-nov-24/3-intro-langchain.ipynb | 1898 +++++++++++++++++ notebooks/malawi-nov-24/README.md | 87 + notebooks/nasa-apod.ipynb | 282 --- .../tunisia-may-24/1-text2sqL-demo.ipynb | 647 ++++++ ...document-classification-with-sklearn.ipynb | 575 +++++ .../tunisia-may-24/3-intro-langchain.ipynb | 1896 ++++++++++++++++ notebooks/tunisia-may-24/README.md | 87 + notebooks/world-bank-api.ipynb | 721 ------- notebooks/world-bank-package.ipynb | 281 --- src/template/__init__.py | 7 - src/template/indicators.py | 83 - src/tunisia/streamlit_app.py | 81 + 28 files changed, 7068 insertions(+), 1744 deletions(-) create mode 100644 docs/course-requirements/data-science.md create mode 100644 docs/course-requirements/learning-python.md create mode 100644 docs/course-requirements/platforms.md create mode 100644 docs/course-requirements/python-environment.md create mode 100644 docs/malawi-nov-24/README.md create mode 100644 docs/tunisia-may-24/README.md create mode 100644 docs/tunisia-may-24/module-1.md create mode 100644 docs/tunisia-may-24/module-2.md create mode 100644 docs/tunisia-may-24/module-3.md create mode 100644 
docs/tunisia-may-24/module-4.md create mode 100644 docs/tunisia-may-24/project-ideas.md create mode 100644 docs/tunisia-may-24/streamlit-app-deployment.md create mode 100644 notebooks/malawi-nov-24/1-text2sqL-demo.ipynb create mode 100644 notebooks/malawi-nov-24/2-document-classification-with-sklearn.ipynb create mode 100644 notebooks/malawi-nov-24/3-intro-langchain.ipynb create mode 100644 notebooks/malawi-nov-24/README.md delete mode 100644 notebooks/nasa-apod.ipynb create mode 100644 notebooks/tunisia-may-24/1-text2sqL-demo.ipynb create mode 100644 notebooks/tunisia-may-24/2-document-classification-with-sklearn.ipynb create mode 100644 notebooks/tunisia-may-24/3-intro-langchain.ipynb create mode 100644 notebooks/tunisia-may-24/README.md delete mode 100644 notebooks/world-bank-api.ipynb delete mode 100644 notebooks/world-bank-package.ipynb delete mode 100644 src/template/__init__.py delete mode 100644 src/template/indicators.py create mode 100644 src/tunisia/streamlit_app.py diff --git a/README.md b/README.md index 0d4db00..d2c183a 100644 --- a/README.md +++ b/README.md @@ -1,378 +1,58 @@ -# Project Template +# Practical Guide to LLMs and Generative AI -[![CalVer](https://img.shields.io/badge/calver-YY.0M.MICRO-22bfda.svg)](https://calver.org) -[![GitHub Release](https://img.shields.io/github/v/release/worldbank/template)](https://github.com/worldbank/template/releases) -[![pre-commit.ci status](https://results.pre-commit.ci/badge/github/worldbank/template/main.svg)](https://results.pre-commit.ci/latest/github/worldbank/template/main) +This repository contains information about a generative AI and LLM course. -The template is a standardized, but flexible *project* and *documentation* structure of folders and files for sharing your data science work. +## Course Overview +The emergence of Generative Artificial Intelligence (AI) and Large Language Models (LLMs) has transformed the field of natural language processing. 
These technologies offer powerful capabilities in text generation and language understanding, adding value to various processes across numerous fields. In this course, we explore the different ways Gen AI can enhance workflows, from data analysis to knowledge sharing and interactive applications. This introductory course aims to demystify generative AI and LLMs for both technical and non-technical audiences across diverse industries. It provides a comprehensive overview of foundational knowledge for understanding LLMs, core concepts in Gen AI, and practical applications of Gen AI in a variety of contexts. -Inspired by [literate programming](http://literateprogramming.com), maintained by the [Development Data Group](https://www.worldbank.org/en/about/unit/unit-dec/dev) and built as [GitHub template repository](https://docs.github.com/en/repositories/creating-and-managing-repositories/creating-a-repository-from-a-template), the template contains: +The course is intended for professionals like analysts, researchers, and other domain experts, equipping them with skills to enhance their work and build applications using LLMs. By the end of the course, participants will have a foundational understanding of Gen AI principles (machine learning and deep learning), the landscape of Gen AI and LLMs (common open-source and proprietary models), and the ability to create applications that utilize LLMs in meaningful ways. -- [**README**](README), [**CODE_OF_CONDUCT**](docs/CODE_OF_CONDUCT.md), [**CONTRIBUTING**](docs/CONTRIBUTING.md) templates - > README files are important and often neglected. The files should inform anyone about the first steps to use, learn and contribute to your project. +## Course Topics -- [**CITATION.cff**](CITATION.cff) - > Embracing [CFF](https://citation-file-format.github.io) aligns with best practices for reproducible research and software development. 
By adhering to established standards for documenting project dependencies and citations, we demonstrate our commitment to quality, transparency, and integrity in our work. +The course will cover the following topics: -- [**LICENSE**](LICENSE) - > The LICENSE is a document that determines what others can and cannot do with contents of the repository. If no license is present, no one has permission to use and/or modify your code. The template is licensed under the [**Mozilla Public License**](https://www.mozilla.org/en-US/MPL/). And so will projects generated from it. For further information, see also [this discussion](https://github.com/orgs/worldbank/discussions/4). +1. **AI Foundations** + This module provides preliminary knowledge in machine learning to better understand generative AI. -- **docs/** +2. **Introduction to Generative AI and LLMs** + This module delves into the core concepts behind LLMs, including their structure, components, common models, and usage. - > Documentation is often never prioritized until last minute. The template aims to revert the malpractice by setting up the documentation as an integral part, inspired by [literate programming](http://literateprogramming.com). With the power of [Jupyter Book](https://jupyterbook.org), data practitioners have a way to share [Jupyter notebooks](https://jupyter.org) on [GitHub Pages](https://pages.github.com) in a standardized and effortless way. +3. **Overview of Gen AI Applications in Data Work** + This module examines how Gen AI can be applied to various stages of data-related processes by focusing on the data value chain. -- [**docs/bibliography.bib**](/docs/bibliography.bib) - > A `bibliography` using the [BibTeX](https://www.bibtex.org/Format/) format. Use this file to include and cite your project's bibliography. See also [Citations and bibliographies](https://jupyterbook.org/en/stable/content/citations.html). +4. 
**Leveraging Gen AI and LLMs for User-Friendly Data Dissemination** + This module focuses on data dissemination and demonstrates different ways to create user-friendly dissemination products that cater to diverse audiences. -- **data/** - > Placeholder folder for data. Data is immutable. By default, the data folder is present but ignored from version control, in order to prevent files of being mistakenly versioned in the code repository. +5. **Case Studies and Project Work** + To solidify the concepts, course participants will undertake a project to create a solution using LLMs at the end of the course. -- **src/** - > Placeholder folder for source code. If Python, it is recommended the package is made pip-installable. +## Course Structure -- **notebooks/** - > Placeholder folder for [Jupyter notebooks](https://jupyter.org). Markdown files and Jupyter notebooks can be added to `docs/_toc.yml` (Table of Contents) to compose the *documentation*. +The course is divided into self-contained modules, each designed to provide useful skills and knowledge. The modules are organized sequentially to build on skills learned in previous modules. To make the course engaging and informative, each module includes the following components: -- [**.pre-commit-config.yml**](https://github.com/worldbank/template/blob/main/.pre-commit-config.yaml) - > Using [pre-commit](https://pre-commit.com) offers a significant advantage in streamlining the development process by enforcing code standards and reducing errors before code reaches the review stage or is committed to the repository. It automates the execution of various checks, such as syntax errors, code formatting, and ensuring compliance with coding standards, which saves time and improves code quality. +- **Lecture** + Each lecture covers key conceptual knowledge for the topic at hand. 
-- [GitHub Actions](https://github.com/features/actions) and [Dependabot](https://docs.github.com/en/code-security/dependabot) - > [GitHub Actions](https://github.com/features/actions) and [Dependabot](https://docs.github.com/en/code-security/dependabot) are two powerful features provided by [GitHub](https://github.com) to automate and secure software development workflows, making it easier for developers to maintain high-quality and safe codebases. +- **Practical Labs** + Programming activities provide learners with practical skills to implement solutions discussed in lectures. These labs include adaptable recipes for various use cases. -- [GitHub Issues and Pull Requests GitHub](https://docs.github.com/en/communities/using-templates-to-encourage-useful-issues-and-pull-requests/configuring-issue-templates-for-your-repository) - > GitHub allows to customize how issues and pull requests are presented to the public. Custom templates encourage collaboration and maintainability. +- **Case Studies** + Case studies showcase elaborate projects that demonstrate real-world applications. -## Benefits +- **Assessment** + Each module assessment combines theoretical and programming questions to evaluate learners' understanding of the concepts and skills covered in the module. -Project templates on GitHub are essential for streamlining the data science and collaboration processes, and they offer several key benefits: +## Course Sessions +This course has been delivered in different formats to cater to various audiences. The initial session took place in Tunisia in May 2024, designed for statisticians and data scientists with a focus on applications relevant to their fields. The upcoming iteration on November 20 - 21 in Malawi will be adapted for a broader audience, primarily IT professionals, to provide them with the skills to leverage generative AI and LLMs in their own domains. 
-- 🛠️ **Consistency and Best Practices:** Project templates encourage consistency in project structure, coding standards, and best practices. They provide a standardized starting point, ensuring that all team members follow the same guidelines and reduce the risk of introducing errors. +## Repository Structure and Contents +This repository serves as the primary resource for accessing course content, including slides, Python programming labs, example applications using LLMs, and additional materials to support learning about Generative AI and building applications with LLMs. For easy navigation, use the link and contents outlined below. -- ⏳ **Time and Effort Savings:** Templates save time by eliminating the need to set up a project from scratch. Developers can quickly start working on their projects without the overhead of configuring the initial project structure, dependencies, or workflows. +### Contents -- 🚀 **Faster Onboarding:** New team members or contributors can easily get up to speed by using project templates. It simplifies the onboarding process, allowing them to understand the project structure and development practices more quickly. - -- 🎨 **Customization and Adaptability:** GitHub project templates can be customized to suit the specific needs of different types of projects or organizations. They serve as a foundation that can be adapted to meet unique requirements. - -- 🤝 **Community Engagement:** Open-source projects can attract more contributors when they provide accessible project templates. These templates facilitate contributions by reducing the barriers to entry for potential collaborators. - -- 🔄 **Version Control Integration:** GitHub project templates are tightly integrated with Git version control. This makes it easier to manage changes, collaborate, and track the history of project configurations. 
- -- 📖 **Documentation and Guidance:** Templates often include documentation and guidance to help developers understand the project's structure and how to get started. This can include README files, code comments, and links to relevant resources. - -- 🔍 **Discoverability:** Templates are discoverable on GitHub, making it easy for developers to find and use project templates for their preferred programming languages, frameworks, and tools. This helps build a supportive ecosystem. - -- ✍️ **Continual Improvement:** Project templates can evolve and improve over time as best practices, technology, and requirements change. This ensures that projects remain up to date and maintainable. - -In summary, GitHub project templates are valuable resources that enhance project management, development practices, and collaboration. They promote consistency, efficiency, and quality in software development, whether for individual projects, open-source contributions, or within organizational contexts. - -```{important} -*With flexibility comes great responsibility*. The template makes a few opiniated choices for the structure and code/documentation management of a project for what we envision to be most cases. However, even the best of the templates would never be perfect for the universe of cases out there. All in all, the template aims to encourage teams to start thinking and assimilate **collaborative coding**, **documentation**​, **enginerring**, **reproducibility​** and **best practices** as an integral part of the project. *In a standardized way*. - -In this spirit, if the template is not for you or in case you have feedback, please consider [opening an issue](https://github.com/worldbank/template/issues) or [submitting a pull request](https://github.com/worldbank/template/pulls) to share your ideas and suggestions. Your contributions would be appreciated immensely. -``` - -## Usage - -### Getting Started - -```{margin} ✨ Can't see the template ? 
-Please ensure you are logged in on [GitHub](https://github.com) and have permissions to create a repository. -``` - -#### 1. **Create new repository from template** - -The template is a [GitHub template repository](https://docs.github.com/en/repositories/creating-and-managing-repositories/creating-a-repository-from-a-template); in other words, you can generate a new GitHub repository with the same files and folders to use as the starting point for your project. - -> 🌟 [Create new repository from **template**](https://github.com/worldbank/template/generate) - -```{figure} docs/images/github-template.png ---- ---- -``` - -Now, give your repository a name, choose the **visibility** (Public or Private) and click **Create repository from template**. - -```{figure} docs/images/github-template-create.png ---- ---- -``` - -*Voilà!* The repository has been created with the same files and folders of the template. - -```{seealso} -For additional information, see the [GitHub documentation](https://docs.github.com/en/repositories/creating-and-managing-repositories/creating-a-repository-from-a-template) -``` - -#### 2. **Enable [GitHub Actions](https://github.com/features/actions) and [GitHub Pages](https://pages.github.com)** - -After creating the repository from the template, you will have to enable [GitHub Actions](https://github.com/features/actions) and [GitHub Pages](https://pages.github.com) to allow the [Jupyter Book](https://jupyterbook.org) to be built and published. - -To activate the workflow, please enable [GitHub Actions](https://github.com/features/actions) by going to the repository's settings (`Settings > Actions > General`), and selecting **read and write permissions** as shown below. - -```{figure} docs/images/github-template-action-enable.png - --- - --- -``` - -To publish, please enable [GitHub Pages](https://pages.github.com) by going to the repository's settings (`Settings > Pages`), and selecting to deploy from the **GitHub Actions** option. 
- -```{figure} docs/images/github-template-pages.png ---- ---- -``` - -On the next push to `main`, the [Jupyter Book](https://jupyterbook.org) will be automatically built and published. You can check the progress on the `Actions` tab. - -```{figure} docs/images/github-template-action.png ---- ---- -``` - -```{caution} -The *documentation* can be published from either *public* and *private* repositories. If publishing private content, please remember to carefully select the content to be made public and to abide by your organization's Data Privacy Policy. -``` - -#### 3. **Update configurations** - -The template comes with a default `docs/_config.yml` Jupyter Book configuration file. Remember to update it to reflect your project's name and details. - -```yaml -repository: -url: https://github.com/worldbank/template -branch: main -``` - -```{seealso} -[Jupyter Book Configuration Reference](https://jupyterbook.org/en/stable/customize/config.html) +```{tableofcontents} ``` -#### 4. **Review and update README files** - -The template comes with README files - including [this **README**](README) - that should provide anyone with the information about the first steps to use, learn and contribute to your project. Please **replace** and/or **repurpose** the files with instructions and detailed information about your project. - -> - **CODE_OF_CONDUCT** -> - **CONTRIBUTING** -> - **README** -> - Issues and Pull Requests GitHub templates - -```{seealso} -[Awesome README](https://github.com/matiassingers/awesome-readme) -``` - -#### 5. **Choose a license** - -The template is licensed under the [**Mozilla Public License**](https://www.mozilla.org/en-US/MPL). A LICENSE is the document that guarantees the repository can be shared, modified and receive contributions. Otherwise, if no license is present, all rights are reserved. - -
- -**Congratulations!** You just created a beautiful home for your project. To access your project page, use (and share) the link as shown below. - -> 🌟 `https://.github.io/` - -````{note} -For example, you can view [this live demo](http://worldbank.github.io/template) using the following link: - -> 🌟 [Live Demo - worldbank.github.io/template](http://worldbank.github.io/template) - -You can also install the latest version directly from the main branch: - -```bash -pip install git+https://github.com/worldbank/template -```` - -### Add content - -The template is created as a [Jupyter Book](https://jupyterbook.org/intro.html) - an open-source project to build beautiful, publication-quality books and documents from computational content. Let's see below how to add, execute and publish new content for your project. - -#### Updating the Jupyter Book `_config.yml` metadata - -To configure your Jupyter Book for your project, you’ll need to update the `_config.yml` file. This file controls various aspects of the Jupyter Book, including the project title, description, and relevant URLs. Below is a template to update this file to reflect the project’s details. - -```yaml -# Book settings -title: -author: - -repository: -url: https://github.com// - -# Jupyter Book options -execute: - execute_notebooks: "auto" # Automatically execute notebooks during the build process -``` - -#### Update table of contents - -When ready to publish the *documentation* on [GitHub Pages](https://pages.github.com/), all you need to do is edit the [table of contents](https://github.com/worldbank/template/blob/main/docs/_toc.yml) and add and/or update content you would like to display. 
[Jupyter Book](https://jupyterbook.org) supports content written as [Markdown](https://daringfireball.net/projects/markdown/), [Jupyter](https://jupyter.org) notebooks and [reStructuredText](https://docutils.sourceforge.io/rst.html) files and the `docs/_toc.yml` file controls the [table of contents](https://github.com/worldbank/template/blob/main/docs/_toc.yml) of your book. - -The template comes with the [table of contents](https://github.com/worldbank/template/blob/main/docs/_toc.yml) below as an example. - -```yaml - -format: jb-book -root: README - -parts: - - - caption: Examples - numbered: True - chapters: - - file: notebooks/world-bank-api.ipynb - - file: notebooks/world-bank-package.ipynb - - file: notebooks/nasa-apod.ipynb - - file: notebooks/bibliography.ipynb -``` - -```{seealso} -[Jupyter Book Structure and organize content](https://jupyterbook.org/en/stable/basics/organize.html) -``` - -#### Add executable content - -[Jupyter Notebooks](https://jupyter.org) can be beautifully rendered and downloaded from your book. By default, the template will render any files listed on the [table of contents](#update-table-of-contents) that have a notebook structure. The template comes with a Jupyter notebook example, `notebooks/world-bank-api.ipynb`, to illustrate. - -```{important} - -By default, Jupyter notebooks are **not** executed. However, you can configure[Jupyter Book](https://jupyterbook.org) to run notebooks during the build process (on GitHub), allowing **code outputs** and **interactive visualizations** to be generated and included in the *documentation* automatically. When enabled, Jupyter notebooks are executed by [GitHub Actions](https://github.com/features/actions) each time a commit is made to the `main` branch. For this to work, it’s crucial to ensure that all necessary [dependencies](##use-pyproject-toml-for-python-package-management) are included in the repository. 
If you want to prevent a specific notebook from being executed, you can [exclude it from execution](https://jupyterbook.org/en/stable/content/execute.html#exclude-files-from-execution). -``` - -```{seealso} -[Jupyter Book Write executable content](https://jupyterbook.org/en/stable/content/executable/index.html) -``` - -#### Distributing Your Project as a Python Package - -If your project uses [Python](https://python.org), it’s highly recommended to distribute it as a [package](https://packaging.python.org/en/latest/tutorials/packaging-projects/). By including a `pyproject.toml` file, the packaging process becomes more streamlined - *trust me [things can get intense](https://imgs.xkcd.com/comics/python_environment.png)*. - -Additionally: - -```{tip} -- Using `pyproject.toml` future-proofs your setup by aligning with modern packaging standards. -- The `pyproject.toml` file acts as a single source of truth for your Python dependencies and project metadata. -- You can combine Conda for system-level dependencies with `pyproject.toml` for Python dependencies, using Conda for environments and pip/poetry for Python packages. -- Any packages in the `src/` folder will be automatically discovered and installed. -``` - -##### Use `pyproject.toml` for Python Package Management - -While the template recommends using [Conda](https://conda.io/projects/conda/en/latest/index.html) (or [Mamba](https://github.com/mamba-org/mamba)) as the environment manager and managing dependencies through an `environment.yml` file, there is an alternative approach that leverages `pyproject.toml`. This can be particularly advantageous if your project is a Python package or if you want to simplify and standardize the management of Python-specific dependencies. - -##### Why use `pyproject.toml`? - -The next step is ensure your code is maintainable, reliable and reproducible by including -any dependencies and requirements, such as packages, configurations, secrets (template) and additional instructions. 
- -1. **Standardization**: `pyproject.toml` is a modern, standardized format defined by [PEP 518](https://peps.python.org/pep-0518/) and [PEP 621](https://peps.python.org/pep-0621/) that centralizes project configuration in Python projects, including build requirements and dependencies. - -2. **Python Packaging**: If your project is to be distributed as a package, `pyproject.toml` is the preferred way to define build tools (like [hatch](https://hatch.pypa.io/latest/config/dependency/) or [poetry](https://python-poetry.org)) and metadata for your package (like name, version, dependencies, etc.). It allows tools like `pip` and `build` to install and package your project more effectively. - -3. **Compatibility with Tools**: The `pyproject.toml` file is compatible with multiple Python packaging and dependency management tools such as `poetry` and `pip`. This allows for smoother integration with CI/CD pipelines, PyPI, and other environments. - -4. **Separation of Concerns**: While Conda manages both system-level and Python-specific packages, using `pyproject.toml` helps isolate Python dependencies. This is useful if your project uses primarily Python packages and you want finer control over Python versioning and dependency resolution. - -#### Example: Using `pyproject.toml` - -This `pyproject.toml` file specifies the dependencies and other metadata for your Python package. You can install these packages using `pip`, ensuring that your Python environment is properly managed. You can still use Conda for system-level packages (such as `libc`, `gdal`, etc.), while using `pyproject.toml` for Python package management. - -1. 
**`pyproject.toml` Example**: - - ```toml - [build-system] - requires = ["hatchling>=1.21.0", "hatch-vcs>=0.3.0"] - build-backend = "hatchling.build" - - [project] - name = "template" - description = "A data science project" - readme = { file = "README.md", content-type = "text/markdown" } - license = { file = "LICENSE" } - authors = [ - { name = "Your Name", email = "your.email@example.com" } - ] - dynamic = ["version"] - - python = ">=3.9" - dependencies = [ - "pandas>=1.4.3,<2", - ] - [project.optional-dependencies] - docs = [ - "docutils==0.17.1", - "jupyter-book>=1,<2", - ] - - [tool.hatch.build.targets.sdist] - include = [ - "src/**/*" - ] - - [tool.hatch.version] - source = "vcs" - ``` - -2. **Keep the Conda Environment for System-level Packages**: - You can continue to use `environment.yml` to specify non-Python dependencies or packages not available on PyPI, such as `mamba` or `gdal`. - - ```yaml - channels: - - conda-forge - dependencies: - - python=3.9 - - mamba - - gdal - ``` - -3. **Installation**: - To create an environment, you would first install the Conda dependencies and then use `pip` to install Python-specific dependencies from `pyproject.toml`. Alternatively, you can skip Conda and use `pip` for the entire setup. - - ```shell - # Create Conda environment - conda env create -f environment.yml -n - - # Activate the environment - conda activate - - # Install Python dependencies - pip install . - ``` - - To install a Python package directly from a [GitHub](https://github.com) repository using [pip](https://pip.pypa.io/en/stable/installation/), you can use the command pip install `git+https://github.com//.git`. This allows you to install the latest version of the package from the repository. You can also specify a particular branch or release tag by adding `@` at the end of the URL This is particularly useful when you want to access features or fixes that haven’t been published on PyPI yet, or to get the latest updates from the repository. 
- - If you want to install the latest release, you should specify the tag associated with that release. For instance: - - ```shell - pip install git+https://github.com//.git@ - ``` - -```{seealso} -- [Packaging Python Projects](https://packaging.python.org/en/latest/tutorials/packaging-projects/) -- [Writing your pyproject.toml](https://packaging.python.org/en/latest/guides/writing-pyproject-toml/) -- [Conda Managing Environments](https://conda.io/projects/conda/en/latest/user-guide/tasks/manage-environments.html) -``` - -#### Building Documentation Locally - -To build the documentation locally, please follow these steps: - -- Install the package with documentation dependencies: - - ```shell - pip install -e .[docs] - ``` - -- Build the documentation: - - ```shell - jupyter-book build . --config docs/_config.yml --toc docs/_toc.yml - ``` - -The generated documentation will be available in the `_build/html` directory. Open the `index.html` file in a web browser to view it. - -## Code of Conduct - -The template maintains a [Code of Conduct](docs/CODE_OF_CONDUCT.md) to ensure an inclusive and respectful environment for everyone. Please adhere to it in all interactions within our community. - ## License The template is licensed under the [**Mozilla Public License**](https://www.mozilla.org/en-US/MPL). Remember to replace the [license](LICENSE) if necessary. If open source, [choose an open source license](https://choosealicense.com). 
diff --git a/docs/_toc.yml b/docs/_toc.yml index 130c2a0..f6a32cd 100644 --- a/docs/_toc.yml +++ b/docs/_toc.yml @@ -2,23 +2,38 @@ format: jb-book root: README parts: - - caption: Examples - numbered: True + - caption: Course Requirements chapters: - - file: notebooks/world-bank-api.ipynb - - file: notebooks/world-bank-package.ipynb - - file: notebooks/nasa-apod.ipynb - - file: notebooks/bibliography.ipynb - - caption: Gallery + - file: docs/course-requirements/learning-python + - file: docs/course-requirements/python-environment + - file: docs/course-requirements/data-science + - file: docs/course-requirements/platforms + - caption: Tunisia, May 2024 chapters: - - file: docs/gallery - - caption: Additional Resources + - file: docs/tunisia-may-24/README + - file: docs/tunisia-may-24/module-1 + - file: docs/tunisia-may-24/module-2 + - file: docs/tunisia-may-24/module-3 + - file: docs/tunisia-may-24/module-4 + - file: docs/tunisia-may-24/project-ideas + - file: notebooks/tunisia-may-24/README + sections: + - file: notebooks/tunisia-may-24/1-text2sqL-demo.ipynb + - file: notebooks/tunisia-may-24/2-document-classification-with-sklearn.ipynb + - file: notebooks/tunisia-may-24/3-intro-langchain.ipynb + - caption: Malawi, Upcoming, November 2024 chapters: - - url: https://datapartnership.org - title: Development Data Partnership - - url: https://wbdatalab.org - title: World Bank Data Lab - - url: https://www.worldbank.org/en/about/unit/unit-dec - title: World Bank DEC - - url: https://www.worldbank.org/en/research/dime - title: World Bank DIME + - file: docs/tunisia-may-24/README + - file: docs/tunisia-may-24/module-1 + - file: docs/tunisia-may-24/module-2 + - file: docs/tunisia-may-24/module-3 + - file: docs/tunisia-may-24/module-4 + - file: docs/tunisia-may-24/project-ideas + - file: notebooks/tunisia-may-24/README + sections: + - file: notebooks/malawi-nov-24/1-text2sqL-demo.ipynb + - file: notebooks/malawi-nov-24/2-document-classification-with-sklearn.ipynb + - file: 
notebooks/malawi-nov-24/3-intro-langchain.ipynb + - caption: Acknowledgements + chapters: + - file: docs/team diff --git a/docs/course-requirements/data-science.md b/docs/course-requirements/data-science.md new file mode 100644 index 0000000..6af40cd --- /dev/null +++ b/docs/course-requirements/data-science.md @@ -0,0 +1,24 @@ + +# Data Science Prerequisites + +In this section, we outline the foundational skills and knowledge in data science, including key areas such as machine learning and natural language processing (NLP), required not only to complete exercises in this course but also to grasp and understand the core concepts of LLMs that will be taught. These prerequisites will provide the essential background needed to effectively work with LangChain and build LLM-based applications. + +## Prerequisite Skills in Data Science +A strong foundation in data science, machine learning, and NLP is crucial for building advanced LLM-based applications. These skills will enable efficient data handling, model building, and language processing, which are fundamental for working with LLMs in real-world scenarios. Below is a list of recommended skills to help you maximize your learning in this course. + +- **Data Science Basics**: Familiarity with data manipulation and analysis, especially using libraries like `pandas` and `numpy`. +- **Machine Learning Fundamentals**: Knowledge of core ML algorithms (e.g., linear regression, decision trees, k-nearest neighbors) and concepts such as overfitting, training/testing splits, and evaluation metrics. +- **Deep Learning Basics**: Basic understanding of neural networks, including feedforward networks and concepts like activation functions, training, and backpropagation. +- **Natural Language Processing (NLP) Basics**: Familiarity with NLP concepts such as tokenization, word embeddings, and basic text processing techniques. 
+- **Working with ML Frameworks**: Experience with libraries like `scikit-learn` for traditional ML models and `TensorFlow` or `PyTorch` for deep learning. + +## Recommended Free Resources +To help you build the required skills in data science, machine learning, and NLP, we’ve compiled a list of free resources. These cover essential topics and tools needed to work with LangChain and LLM-based applications effectively. Whether you’re new to these fields or looking to deepen your understanding, these resources will be valuable in building your foundational knowledge. + + +| Focus | Provider | Duration | Course URL | +|--------------------------|------------------------|------------|------------------------------------------------------------------------------------------------------| +| Machine Learning Basics | Google Developers | 8 hours | [ML Intro with scikit-learn](https://developers.google.com/machine-learning/crash-course) | +| NLP with Transformers | Hugging Face | 4 hours | [Hugging Face Transformers](https://huggingface.co/learn/nlp-course/chapter1) | +| NLP Basics | fast.ai | 3 hours | [NLP with fast.ai](https://course.fast.ai/) | +| Machine Learning Basics | Coursera (Andrew Ng) | 60 hours | [Coursera ML course](https://www.coursera.org/learn/machine-learning) | diff --git a/docs/course-requirements/learning-python.md b/docs/course-requirements/learning-python.md new file mode 100644 index 0000000..27ccd86 --- /dev/null +++ b/docs/course-requirements/learning-python.md @@ -0,0 +1,26 @@ +# Python Prerequisites +In this section, we outline the minimal Python skills required to complete the programming exercises in this course. We say minimal because some of the project work may require additional skills and packages. + +## Prerequisite Python Skills +A solid foundation in core Python skills is essential for building LLM-based applications with LangChain. 
These prerequisites enable efficient coding, debugging, and API interaction, which are critical for working effectively with language models. Below is a list of recommended skills to help you maximize your learning in this course. + +- **Basic Python Programming**: Understanding variables, data types, and control structures (loops and conditionals). +- **Functions and Modules**: Ability to create and use functions, import modules, and manage dependencies. +- **Object-Oriented Programming (OOP)**: Familiarity with classes, objects, inheritance, and basic OOP principles. +- **Working with APIs**: Understanding how to make HTTP requests and handle API responses, ideally with libraries like `requests`. +- **File I/O**: Reading from and writing to files, especially working with text files and JSON data. +- **Environment Management**: Experience with virtual environments (`venv`, `conda`) and package management with `pip`. +- **Error Handling**: Understanding of exceptions and error handling in Python. +- **Jupyter Notebooks**: Experience working with Jupyter Notebooks, especially for experimenting with and testing code interactively. + +These prerequisites will provide a solid foundation for building applications with LangChain and LLMs. + + +## Recommended Free Resources +To support you in building the necessary Python skills for this course, we’ve compiled a list of free resources to help you learn or review key concepts. These resources cover everything from basic programming to more advanced topics, ensuring you have a solid foundation for working with LangChain and LLM-based applications. Whether you're new to Python or just need a refresher, these materials will provide valuable guidance. 
+| Focus | Provider | Duration | Course URL | +|--------------|--------------|------------|-----------------------------------| +| Basic Python | Codecademy | 25 hours | [Codecademy Python](https://www.codecademy.com/learn/learn-python-3) | +| Basic Python | DataCamp | 4 hours | [Python for Data Science](https://www.datacamp.com/courses/intro-to-python-for-data-science) | +| Basic Python | Google | 2 days | [Google Python Course](https://developers.google.com/edu/python) | +| Basic Python | Udemy | 4 hours | [Udemy Python Course](https://www.udemy.com/course/python-for-beginners/) | diff --git a/docs/course-requirements/platforms.md b/docs/course-requirements/platforms.md new file mode 100644 index 0000000..9aaf773 --- /dev/null +++ b/docs/course-requirements/platforms.md @@ -0,0 +1,90 @@ +# Required Platforms and Access Setup + +To complete the course exercises and build applications effectively, you will need access to specific platforms. This document outlines the necessary accounts and API keys or tokens required for each platform, organized into three sections: **LLMs**, **Cloud Compute Platforms**, and **Other** (for additional services like Twilio and GitHub). + +## 1. LLM Platforms + +In this section, we cover the required access for platforms that provide large language models (LLMs) and related resources. + +### OpenAI Developer API Key + +To access OpenAI’s models programmatically, you need an OpenAI API key. Follow these steps: + +1. **Create an OpenAI Account** + Go to [OpenAI’s website](https://platform.openai.com/signup) to sign up. + +2. **Generate an API Key** + - Log in and navigate to [API Keys](https://platform.openai.com/account/api-keys). + - Click on **Create new secret key** to generate a new API key. + - Copy and store the key securely, as it will be needed to authenticate with OpenAI’s API. + +3. **Usage and Billing** + OpenAI offers a free trial, but be mindful of usage limits and potential charges. 
+ +### Hugging Face Token + +To access Hugging Face’s models and datasets programmatically, you’ll need a Hugging Face access token. + +1. **Create a Hugging Face Account** + Sign up at [Hugging Face’s website](https://huggingface.co/join). + +2. **Generate an Access Token** + - Log in, go to **Settings**, and select **Access Tokens**. + - Click **New token**, set a name (e.g., “Course Token”), choose “Read” for access level, and generate the token. + - Copy and save the token for use with Hugging Face’s resources. + +## 2. Cloud Compute Platforms + +This section details required access for cloud-based compute resources. + +### AWS (Amazon Web Services) + +AWS will provide cloud resources for deploying and running applications at scale. + +1. **Create an AWS Account** + Go to [AWS’s website](https://aws.amazon.com/) to create an account. + +2. **Generate Access Keys** + - Log in to the AWS Management Console. + - Navigate to **IAM (Identity and Access Management)** > **Users** and select your user. + - Under **Security credentials**, click **Create access key**. + - Copy and store your Access Key ID and Secret Access Key securely for connecting to AWS services. + +3. **Free Tier Usage** + AWS offers a free tier for new users, which may be sufficient for many course exercises. Monitor usage to avoid unexpected charges. + +## 3. Other Platforms + +This section includes additional services needed for the course. + +### Twilio (for WhatsApp Integration) + +Twilio will enable WhatsApp access, allowing you to build and deploy chatbot applications. + +1. **Create a Twilio Account** + Sign up at [Twilio’s website](https://www.twilio.com/). + +2. **Generate an API Key for WhatsApp** + - After logging in, navigate to **Console** > **API Keys & Tokens**. + - Click on **Create new API Key**, give it a name, and copy the SID and Secret. + - Follow Twilio’s documentation to set up WhatsApp messaging capabilities, including linking your WhatsApp number. + +3. 
**Free Trial** + Twilio offers a free trial with a small amount of credit, allowing you to experiment with WhatsApp API functionality. Be sure to check usage limits. + +### GitHub (for Project Repository Management) + +GitHub will be used to manage project files and collaborate on code. + +1. **Create a GitHub Account** + Go to [GitHub’s website](https://github.com/) and sign up for an account if you don’t already have one. + +2. **Set Up SSH Keys (Optional)** + To simplify authentication, you may want to set up SSH keys. + - Follow the instructions in GitHub's documentation for [generating SSH keys](https://docs.github.com/en/authentication/connecting-to-github-with-ssh). + - Once set up, add the public key to your GitHub account under **Settings** > **SSH and GPG keys**. + +3. **Forking and Cloning Repositories** + During the course, you will be working with GitHub repositories. Familiarize yourself with forking and cloning repositories to easily access course materials and project files. + +--- diff --git a/docs/course-requirements/python-environment.md b/docs/course-requirements/python-environment.md new file mode 100644 index 0000000..c9490ed --- /dev/null +++ b/docs/course-requirements/python-environment.md @@ -0,0 +1,70 @@ +# Python Environment Configuration +In this section, we provide the minimal Python packages required to complete the programming exercises in this course. We are saying minimal because for some of the project work, you may need extra packages + +## Python Installation +We will be using Python 3.12 for this course. Please refer to the installation options below. + +- **Recommended: Installation with Anaconda**. [Download Anaconda](https://www.anaconda.com/download). For more details about Anaconda, refer to this [blog post](https://www.anaconda.com/blog). 
+ +- **Alternative: Installation from Python Website** +[Download Python](https://www.python.org/downloads/) + +## Python IDE +An IDE (Integrated Development Environment) is a software application that provides programmers with tools for software development, such as a source code editor, compiler, build automation, and debugging tools. Popular Python IDEs include Jupyter Notebook, VS Code, and PyCharm. + +### Jupyter Notebook and Google Colab + +After installing Python, you can proceed to install Jupyter Notebook, the default IDE for data science and scientific computing. Jupyter Notebook allows you to write code and include documentation with Markdown. If you installed Python via the Anaconda distribution, Jupyter Notebook and other commonly used Python packages come pre-installed, saving you additional setup steps. + +In addition to the local Jupyter Notebook installation with Anaconda, you can also use a similar environment on hosted servers like Google Colab. Google Colab is an online Jupyter Notebook accessible via the cloud, offering free GPUs for working with LLMs and other AI-based Python programs. + +### Full-Featured IDEs +While Jupyter Notebooks are excellent for interactive data science work, this course focuses on building a chatbot, which requires a fully-featured IDE. Below are some commonly used IDEs: + +> 🚀 **VS Code**: Recommended IDE for this course. See [installation instructions](https://code.visualstudio.com). + +**Other IDEs** +- **Notepad++** +- **PyCharm** + +## Python Environment Setup +### Major Packages +For the most part, we’ll install packages as needed. However, here’s a list of core packages we’ll require: + +1. Transformers + +2. PyTorch + +3. Hugging Face + +4. LangChain + +The full list of required packages is provided in the ```requirements.txt``` file. + +### Python Environment Setup +#### Create Virtual Environment +Create a Python virtual environment to use for this project. The Python version used when this was developed was 3.12. 
The code below creates a virtual environment and also installs all the Python packages we need for this tutorial +``` +python -m venv .venv +source .venv/bin/activate +pip install -U pip +pip install -r requirements.txt +``` +#### Setup ```.env``` file +This file is important for keeping your API keys and other secrets +``` +# OpenAI +OPENAI_API_KEY="" +# Hugging Face +HUGGINGFACEHUB_API_TOKEN="" + +# Twilio Credentials +TWILIO_ACCOUNT_SID="" +TWILIO_AUTH_TOKEN="" +TWILIO_NUMBER="" + +# PostgreSQL connection details +DB_USER = "" +DB_PASSWORD = "" +``` + diff --git a/docs/malawi-nov-24/README.md b/docs/malawi-nov-24/README.md new file mode 100644 index 0000000..b7aeda1 --- /dev/null +++ b/docs/malawi-nov-24/README.md @@ -0,0 +1,23 @@ +# LLM Application Development with LangChain and Python +In this iteration of the course, participants will explore how to develop advanced applications using Large Language Models (LLMs) with the LangChain framework in Python. Through practical exercises and real-world case studies, the course dives into the technical aspects of building LLM-powered solutions, covering everything from prompt engineering to integrating various data sources and APIs. Participants will gain hands-on experience in creating dynamic applications, including intelligent chatbots and automated workflows. The course begins by providing a foundational understanding of LLMs—how they are trained and adapted for different domains through techniques like prompt engineering and fine-tuning. It then introduces LangChain, a leading framework for building LLM applications, empowering participants to enhance their business processes with LLMs. Ideal for developers, data scientists, data engineers, analysts, and professionals across industries such as banking, telecommunications, and the public sector, this course equips you with the skills needed to build your first production-grade LLM application. 
+ +The course is structured into self-contained modules, each building on the skills learned in previous ones. Each module includes lectures for key concepts, practical labs with programming activities and modifiable recipes, and case studies that showcase real-world applications. To reinforce learning, assessments combine theoretical and programming questions to evaluate the learner's understanding and skills gained. + + + +## Session Details + +### Audience +This session targeted staff from National Statistical Offices across 13 African countries, including Kenya, Tunisia, Burundi, Niger, Burkina Faso, Senegal, Cameroon, Mali, Côte d'Ivoire, Uganda, Central African Republic (RCA), Tanzania, and Mozambique. + +### Organization +The course was divided into three phases, each tailored to maximize learning and engagement: + +- **Phase 1: Virtual Session** + This brief, 3-hour virtual session introduced participants to the course content and sparked enthusiasm for the in-person session. + +- **Phase 2: In-Person Session** + Conducted over five days, this phase combined two components: a 3-day module on big data, followed by this 2-day LLM course. + +- **Phase 3: Project Implementation** + In this phase, participants applied what they learned in the previous sessions by building LLM-based applications, primarily chatbots, to facilitate the dissemination of information. diff --git a/docs/tunisia-may-24/README.md b/docs/tunisia-may-24/README.md new file mode 100644 index 0000000..f46a871 --- /dev/null +++ b/docs/tunisia-may-24/README.md @@ -0,0 +1,20 @@ +# Generative AI and LLMs for Data Literacy + +The first iteration of this course was delivered in Tunis, Tunisia, from May 27 to May 31, as part of the Data in Health Program organized by the World Bank Group and the African Development Bank. 
+ +## Session Details + +### Audience +This session targeted staff from National Statistical Offices across 13 African countries, including Kenya, Tunisia, Burundi, Niger, Burkina Faso, Senegal, Cameroon, Mali, Côte d'Ivoire, Uganda, Central African Republic (RCA), Tanzania, and Mozambique. + +### Organization +The course was divided into three phases, each tailored to maximize learning and engagement: + +- **Phase 1: Virtual Session** + This brief, 3-hour virtual session introduced participants to the course content and sparked enthusiasm for the in-person session. + +- **Phase 2: In-Person Session** + Conducted over five days, this phase combined two components: a 3-day module on big data, followed by this 2-day LLM course. + +- **Phase 3: Project Implementation** + In this phase, participants applied what they learned in the previous sessions by building LLM-based applications, primarily chatbots, to facilitate the dissemination of information. diff --git a/docs/tunisia-may-24/module-1.md b/docs/tunisia-may-24/module-1.md new file mode 100644 index 0000000..3f4c30d --- /dev/null +++ b/docs/tunisia-may-24/module-1.md @@ -0,0 +1,35 @@ +# Module 1: AI Foundations + +### Module Objectives +The goal of this module is to introduce learners to the fields of machine learning and deep learning. By the end of this module, learners should understand how predictive models are built in Python and be able to distinguish between simple machine learning models, such as linear regression, and deep learning models. Learners will also gain an appreciation for how data is used to build ML models, the process of developing ML models and deploying them to production, and the infrastructure required to support ML systems. + +### Module Topics +- **Machine Learning (ML) and Neural Networks** + - Problem formulation and techniques: Regression, Nearest Neighbors, Tree-Based Models, Clustering, Principal Component Analysis. 
+- **Major ML Application Areas** + - Natural Language Processing (NLP), Computer Vision, Recommender Systems. +- **Platforms for Building ML Models** + - Python for ML and Data Science. +- **Machine Learning vs. Statistics** + - Similarities and Differences. +- **Tools and Platforms** + - Python, scikit-learn, PyTorch, and cloud-based platforms. +- **Building ML Systems** + - Data preparation, model training and evaluation, model deployment, and serving. + +### ML Use Cases + +### Practical Labs +- **Traditional ML** + - Build a predictive model to replace/impute missing data. + - Build a predictive model for predicting poverty from LSM data. +- **Deep Learning** + - Build a simple computer vision model. + - **Deep Learning-NLP**: Build a document classification system. + +### Case Studies +- **[World Bank]** Small area estimation of poverty. +- **[World Bank]** Object detection from high-resolution satellite imagery. + +### Assessment +- To be determined (TBD). \ No newline at end of file diff --git a/docs/tunisia-may-24/module-2.md b/docs/tunisia-may-24/module-2.md new file mode 100644 index 0000000..6e4229c --- /dev/null +++ b/docs/tunisia-may-24/module-2.md @@ -0,0 +1,58 @@ +# Introduction to Generative AI and LLMs + +### Module Objectives +This module provides foundational knowledge on Large Language Models (LLMs), covering key concepts such as pretraining, foundational models, and adapting LLMs through fine-tuning. Additionally, the module introduces various open-source and proprietary LLMs currently available on the market. + +### Module Topics +- **Introducing Generative AI** + - What is Generative AI? + - How Gen AI differs from Predictive AI. + - Brief history of Gen AI. + - Capabilities of Gen AI and major use cases. + - Different categories of Gen AI (LLMs, image generators, video generators). + +- **Understanding Large Language Models (LLMs)** + - Overview and history of language models—LLMs vs. SLMs. 
+ - Categories of LLMs: Foundation models and other concepts. + - Building LLMs: Transformer architecture and sequence-to-sequence architectures. + - Overview of common LLMs: OpenAI models, Mistral AI, Llama, Gemini, and others. + - Adapting and customizing LLMs: fine-tuning, pre-training, RLHF. + +- **Building and Evaluating LLM Apps** + - Key concepts for LLM apps: prompt engineering, prompt-tuning, vector embeddings, RAG. + - Ecosystem of commercial and open-source tools for building LLM apps (e.g., LangChain). + - Customizing LLMs for specific use cases: prompt engineering, RAG, fine-tuning, RLHF. + - Selecting and evaluating LLMs and LLM apps. + +- **Deploying LLM Apps with LangChain** + - Overview of LangChain features and capabilities. + - Preprocessing and loading data in LangChain. + - Working with different LangChain agents (e.g., SQL). + - Deploying LangChain applications (e.g., with Streamlit, WhatsApp, and web apps). + - Evaluating LangChain apps. + +### Practical Labs + +- **Lab 1: Demonstration of Building an LLM App with Commercial Tools (OpenAI)** + Since participants won’t have access to a paid OpenAI subscription, the instructor will demonstrate available capabilities for building LLM apps. The lab will include: + - Exploring OpenAI features using the ChatGPT GUI (paid version), showing functionalities and how to create assistants. + - Demonstrating a simple RAG-based chatbot using the OpenAI playground. + - Demonstrating a simple RAG-based chatbot using the OpenAI API. + +- **Lab 2: Building LLM Apps Using LangChain (RAG-Based Chatbot)** + Participants will use provided documents to build a RAG-based app with an open-source LLM to query the documents. The output will be a Streamlit app for sharing. Tasks include: + - Setting up the development environment and installing required packages. + - Preparing source data (e.g., health documents). + - Setting up a vector database. + - Preprocessing documents and loading them into the vector database. 
+ - Integrating with an LLM (including selecting the LLM). + - Developing the user interface in Streamlit. + - Deploying and testing the app. + +### Case Study +- **Agricultural Information Q&A System in Malawi** + A RAG-based chatbot deployed in Malawi answers questions from Agricultural Extension workers. This example app uses ChatGPT (OpenAI) integrated with agricultural documents from Malawi and is deployed on WhatsApp. + +### Assessment +- **Build an LLM App with LangChain** + Participants will receive a notebook to create an app that answers questions based on their selected website. Additionally, a quiz with five multiple-choice questions will be administered. diff --git a/docs/tunisia-may-24/module-3.md b/docs/tunisia-may-24/module-3.md new file mode 100644 index 0000000..28d4954 --- /dev/null +++ b/docs/tunisia-may-24/module-3.md @@ -0,0 +1,39 @@ +# Module 3: Gen AI and LLM Applications in Statistics + +### Module Objectives +This module explores both current and potential applications of Gen AI and LLMs in the field of statistics. Covering the entire statistical life cycle—from data collection and processing to analysis and dissemination—we examine how LLMs can enhance each stage. For instance, Ask a Question (AAQ) platforms can interpret and respond to natural language queries, providing relevant statistical information. Learners will be introduced to tools for creating accessible platforms, like a WhatsApp bot, that can answer questions using statistical data as its knowledge base. + +### Module Topics +- **Qualitative and Multi-Modal Data Analysis with LLMs** +- **Advanced Image Analysis in Statistical Data Collection** +- **Text Data Analysis with LLMs** + - Applications like sentiment analysis, parsing web-scraped price data, analyzing qualitative research data, and more. +- **Audio Data Processing and Analysis with Speech Models** + - For example, processing data from focus group discussions (FGDs) or interview data. 
+- **LLM Applications in Data Dissemination** + - LLMs in data discovery: Semantic search vs. keyword search. + - Enhancing and automating metadata generation with LLMs. + - Statbots: Chatbots that can respond to statistical queries. + +- **Concepts in LLM Statbots** + - LLMs' quantitative reasoning abilities and capacity to work with tabular data. + - Strategies for connecting an LLM to statistical data: Text2SQL, Text2API, Text2Code, and more. + - Tools for parsing and working with tabular documents (e.g., DocumentLLM, LangChain SQL agent). + - Security considerations for Text2SQL and database connections. + +- **Building a Statbot** + - LLM selection guide. + - Tool selection. + - Deploying statbots on platforms like WhatsApp, websites, and more. + +### Practical Labs + +- **Lab 1: Building a Health Statbot** + Participants will use a set of provided documents to build a RAG-based app using an open-source LLM to query the data. The lab will result in a Streamlit app that can be shared. + +### Case Study +- **Accessing Databases with LLMs** + (Details TBD) + +### Assessment +- To be determined (TBD). diff --git a/docs/tunisia-may-24/module-4.md b/docs/tunisia-may-24/module-4.md new file mode 100644 index 0000000..78b0022 --- /dev/null +++ b/docs/tunisia-may-24/module-4.md @@ -0,0 +1,26 @@ +# Module 4: Case Studies and Project Work + +### Module Objectives +This module focuses on real-world use cases, guiding participants through practical applications of LLM technology. + +### Module Topics +- **Implementing LLM Apps** + - Differences between open-source and proprietary models. + - Approaches such as fine-tuning vs. RAG. + - Platform selection and performance evaluation. +- **Major Use Cases in Data Applications** + - Existing use cases (TBD) and potential applications in low-income regions. + - Implementation challenges. +- **Capstone Project** + - Hands-on project work where participants apply the concepts learned throughout the course. 
+ +### Practical Labs +- **Building an LLM Project** + Participants will work on an LLM project designed to apply the knowledge gained in a data-centric use case. + +### Case Study +- **Building an LLM Project for Data Applications** + (Details TBD) + +### Assessment +- **LLM Project Work** diff --git a/docs/tunisia-may-24/project-ideas.md b/docs/tunisia-may-24/project-ideas.md new file mode 100644 index 0000000..9558306 --- /dev/null +++ b/docs/tunisia-may-24/project-ideas.md @@ -0,0 +1,44 @@ +# AI and LLM Project + +In this document, we will guide you through three key considerations for implementing your project in this course: choosing a project, defining acceptable outputs, and understanding the project selection process. + +## Recommended Projects + +This section addresses the question, "What project can I do?" Based on the course content, we provide recommended projects, but you are encouraged to explore other ideas. + +### QA Chatbots +One common use case for generative AI is creating conversational systems like chatbots that can answer questions on specific topics. While chat-GPT and other models can handle general questions, they lack access to custom organizational data. For example, in the health domain, you may want to create a chatbot that answers questions on public health issues in your country. By using LLMs, you can create custom chatbots with access to specialized documents or websites for local knowledge. + +> Note: For QA chatbots, we will focus on those that respond to textual questions rather than numeric or data-intensive information. + +### Statsbots +Similar to the QA chatbot, a Statsbot is designed to answer quantitative questions. LLMs traditionally struggle with numeric data, so specialized tools are necessary for chatbots that work with tabular data and provide accurate, data-driven answers. + +### Miscellaneous Document Analysis +LLMs are highly effective for analyzing documents, classifying them, and performing various NLP tasks. 
Examples of document analysis projects include: +- Sentiment Analysis +- Topic Classification +- Intent Classification +- Named Entity Recognition (NER) +- Document Type Classification +- Key Phrase Extraction +- Toxicity and Hate Speech Detection + +## Guidelines for Choosing a Project + +Select your project thoughtfully, given the limited time available. Here are factors to consider: + +- **Data Availability**: Ensure that the necessary data or documents are accessible for the project. +- **Skills and Knowledge**: Assess the required platforms or tools and confirm that team members are willing to learn and work with them. +- **Effort**: Be realistic about the project's scope. Certain tasks, like fine-tuning an LLM, may require additional time and resources. +- **Cost**: Some LLM platforms and tools may require subscriptions or fees. For example, using the chat-GPT API requires a developer account with sufficient funds. While paid platforms are sometimes necessary, ensure that you understand the associated requirements. + +## Permissible Project Outputs + +We recommend including three key components as project outputs: + +1. **User Interface** + Implementing LLMs often involves facilitating user interaction with documents, data, or other elements. For a more user-friendly experience, we suggest creating a user interface, such as a web-based UI, WhatsApp chatbot, or command-line tool. + +2. **Documentation on GitHub** + As this is a technical project, you’ll write substantial code. Using a version control system like GitHub is recommended to track yo diff --git a/docs/tunisia-may-24/streamlit-app-deployment.md b/docs/tunisia-may-24/streamlit-app-deployment.md new file mode 100644 index 0000000..dfe7fe0 --- /dev/null +++ b/docs/tunisia-may-24/streamlit-app-deployment.md @@ -0,0 +1,55 @@ +# Deploying a Chatbot on Streamlit +In this activity, you will use the knowledge gained from the LangChain Tutorial to explore a chatbot deployed on Streamlit. 
You will deploy this app on your computer and interact with it. + +## About Streamlit + +As discussed in the lectures, Streamlit is a platform that enables data scientists to deploy dynamic, data-based apps. It’s ideal for prototyping demonstration apps and sharing them with stakeholders before full-scale production deployment. + +## Initial Setup and Getting the Chatbot Files + +1. **Get OpenAI and Hugging Face API Credentials** + The chatbot uses OpenAI models, so you’ll need to sign up for an OpenAI developer account and obtain an API key. For a step-by-step guide on creating an OpenAI API key, search for instructions on ChatGPT. Similarly, create a Hugging Face account and obtain an API token. + +2. **Try the Chatbot on Streamlit Community Cloud** + Before downloading anything, you can try the chatbot on the Streamlit Community Cloud with just the OpenAI and Hugging Face keys. + +3. **Download or Clone the Project Repository** + To get the project files on your computer, either clone the GitHub repository (if familiar with Git) or download the repository as a zipped file. + +## Deploying the Streamlit App Locally + +1. **Unzip and Navigate to the Project Folder** + Once unzipped, open the project folder and follow the instructions on the GitHub page to deploy the chatbot. + +2. **Follow steps on GitHub project repository**. [Streamlit app repo](https://github.com/worldbank/RAG-Based-ChatBot-Example) + + +3. **Install Required Packages** + The `requirements.txt` file contains a list of all required packages. If you encounter a missing package error, try installing the package again (ensuring your virtual environment is activated). + +4. **Run the App Locally** + Run the app with the following command: + ```bash + streamlit run streamlit_app.py + ``` +5. **Test and Check**. When deployed locally, you can browse the files being used in the app. 
+ +## Explore Important Scripts + +The essential components for building a chatbot with LangChain are organized into distinct, modular Python scripts. Let’s explore some of these elements. You can use VS Code or your preferred text editor for this task. + +### Loading Files +In real-life applications, you may need to load hundreds of documents, requiring a versatile function for file loading. This project includes two types of loaders: +- **`remote_loader.py`**: For loading documents from websites. +- **`local_loader.py`**: For loading documents from the local `data` folder. + +### Document Splitting +The `splitter.py` module uses the `RecursiveCharacterTextSplitter` strategy, with a chunk size of 1000 and an overlap of 0. This method helps in breaking down large documents into manageable sections for processing. + +### Prompt Chains +In the `full_chain.py`, `base_chain.py`, and `rag_chain.py` modules, you’ll find configurations for the specific LLM models and prompting strategies used. The project utilizes OpenAI chat models, with customized chains designed to guide interactions effectively. + +### Memory Management +Memory management strategies are also implemented to optimize the chatbot’s performance, particularly for long interactions or when processing large datasets. + + diff --git a/notebooks/malawi-nov-24/1-text2sqL-demo.ipynb b/notebooks/malawi-nov-24/1-text2sqL-demo.ipynb new file mode 100644 index 0000000..779899e --- /dev/null +++ b/notebooks/malawi-nov-24/1-text2sqL-demo.ipynb @@ -0,0 +1,647 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "5eb5dc4b-add1-4b1d-8fbb-3a1e85e552f7", + "metadata": {}, + "source": [ + "# Chatting with a Population Dataset Using LangChain and LLMs\n", + "\n", + "----\n", + "\n", + "In this simple demonstration, we show how you can use natural language to query a structured dataset. The dataset is a 2018 population census enumeration level data from Malawi." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "a16425c5-c0ee-4bc9-8f80-1684edc5a843", + "metadata": {}, + "outputs": [ + { + "name": "stdin", + "output_type": "stream", + "text": [ + " ········\n" + ] + } + ], + "source": [ + "import getpass\n", + "import pandas as pd\n", + "import os\n", + "\n", + "os.environ[\"OPENAI_API_KEY\"] = getpass.getpass()" + ] + }, + { + "cell_type": "markdown", + "id": "5186c265-b78c-41f6-a4a4-4401e6ccb7cf", + "metadata": {}, + "source": [ + "## 1. Creating a SQLLite Database\n", + "Use a CSV file to create a database. The file which was used to create a database is shown below as a Pandas Dataframe" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "1b2308a3-7bb2-47ca-86c8-01ff060105e4", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
RegionNameDistrictNameTANameEnumerationAreaTotalPopulationPopulationMalePopulationFemaleNumberHouseholds
0CentralNtchisiTA Malengaea-20307802633.0331.0302.0145.0
1CentralNtchisiTA Malengaea-203070251006.0507.0499.0226.0
2CentralNtchisiTA Malengaea-203070071503.0740.0763.0338.0
3CentralNtchisiTA Malengaea-203070051139.0553.0586.0251.0
4CentralNtchisiTA Malengaea-203070121400.0668.0732.0284.0
\n", + "
" + ], + "text/plain": [ + " RegionName DistrictName TAName EnumerationArea TotalPopulation \\\n", + "0 Central Ntchisi TA Malenga ea-20307802 633.0 \n", + "1 Central Ntchisi TA Malenga ea-20307025 1006.0 \n", + "2 Central Ntchisi TA Malenga ea-20307007 1503.0 \n", + "3 Central Ntchisi TA Malenga ea-20307005 1139.0 \n", + "4 Central Ntchisi TA Malenga ea-20307012 1400.0 \n", + "\n", + " PopulationMale PopulationFemale NumberHouseholds \n", + "0 331.0 302.0 145.0 \n", + "1 507.0 499.0 226.0 \n", + "2 740.0 763.0 338.0 \n", + "3 553.0 586.0 251.0 \n", + "4 668.0 732.0 284.0 " + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_pop = pd.read_csv(\"mw-ea-pop.csv\")\n", + "df_pop.head(5)" + ] + }, + { + "cell_type": "markdown", + "id": "1a28fd50-a6c4-4929-8212-b2f4c9889b80", + "metadata": {}, + "source": [ + "## 2. Setup LangChain for Connecting to Database\n", + "The tool we will use is called LangChain. Its a popular tool for creating apps ontop of LLMs. During the course, we will delve more into using LangChain." 
+ ] + }, + { + "cell_type": "markdown", + "id": "cbe6719b-2c38-49d7-8c96-622fc6900207", + "metadata": {}, + "source": [ + "### 2.1 Import LangChain Packages and Setup Connection" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "e1ed50a5-adc2-4ab2-a4ca-ad9e487fa464", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain_community.utilities import SQLDatabase\n", + "from langchain.chains import create_sql_query_chain\n", + "from langchain_openai import ChatOpenAI\n", + "\n", + "from langchain_community.tools.sql_database.tool import QuerySQLDataBaseTool\n", + "\n", + "from operator import itemgetter\n", + "\n", + "from langchain_core.output_parsers import StrOutputParser\n", + "from langchain_core.prompts import PromptTemplate\n", + "from langchain_core.runnables import RunnablePassthrough" + ] + }, + { + "cell_type": "markdown", + "id": "ef46f940-9e27-4e49-b84a-41001ba9a79d", + "metadata": {}, + "source": [ + "### 2.2 Create the SQL Agent and a Chain" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "c44bb470-9afd-4fcd-bf67-c3cc08ddfcb9", + "metadata": {}, + "outputs": [], + "source": [ + "# Test connection to the database\n", + "db = SQLDatabase.from_uri(\"sqlite:///mydatabase.db\")\n", + "llm = ChatOpenAI(model=\"gpt-3.5-turbo\", temperature=0)\n", + "\n", + "execute_query = QuerySQLDataBaseTool(db=db)\n", + "write_query = create_sql_query_chain(llm, db)\n", + "chain = write_query | execute_query" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "2da6cdd0-012b-4fb2-8ae3-bde91d67aa20", + "metadata": {}, + "outputs": [], + "source": [ + "answer_prompt = PromptTemplate.from_template(\n", + " \"\"\"Given the following user question, corresponding SQL query, and SQL result, answer the user question.\n", + "\n", + "Question: {question}\n", + "SQL Query: {query}\n", + "SQL Result: {result}\n", + "Answer: \"\"\"\n", + ")\n", + "\n", + "answer = answer_prompt | llm | StrOutputParser()\n", + "chain = 
(\n", + " RunnablePassthrough.assign(query=write_query).assign(\n", + " result=itemgetter(\"query\") | execute_query\n", + " )\n", + " | answer\n", + ")\n" + ] + }, + { + "cell_type": "markdown", + "id": "6f39f651-c3e4-4fa7-b67a-b6b8c91de57f", + "metadata": {}, + "source": [ + "## 3. Chat with the Data" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "2ff768f4-7622-43db-afbe-4155bf5eeff2", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Failed to batch ingest runs: LangSmithError('Failed to POST https://api.smith.langchain.com/runs/batch in LangSmith API. HTTPError(\\'403 Client Error: Forbidden for url: https://api.smith.langchain.com/runs/batch\\', \\'{\"detail\":\"Forbidden\"}\\')')\n", + "Failed to batch ingest runs: LangSmithError('Failed to POST https://api.smith.langchain.com/runs/batch in LangSmith API. HTTPError(\\'403 Client Error: Forbidden for url: https://api.smith.langchain.com/runs/batch\\', \\'{\"detail\":\"Forbidden\"}\\')')\n" + ] + }, + { + "data": { + "text/plain": [ + "'There are 33 districts in Malawi.'" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Failed to batch ingest runs: LangSmithError('Failed to POST https://api.smith.langchain.com/runs/batch in LangSmith API. 
HTTPError(\\'403 Client Error: Forbidden for url: https://api.smith.langchain.com/runs/batch\\', \\'{\"detail\":\"Forbidden\"}\\')')\n" + ] + } + ], + "source": [ + "user_question = \"How many districts are there in Malawi\"\n", + "chain.invoke({\"question\": \"{}\".format(user_question)})" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "b6d13719-e60e-4cff-afdb-c76253a65fc1", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "32" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# VERIFY THIS INFORMATION USING PYTHON\n", + "df_pop.DistrictName.nunique()" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "a6305316-c437-4e86-bb3a-8f75450897e6", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Failed to batch ingest runs: LangSmithError('Failed to POST https://api.smith.langchain.com/runs/batch in LangSmith API. HTTPError(\\'403 Client Error: Forbidden for url: https://api.smith.langchain.com/runs/batch\\', \\'{\"detail\":\"Forbidden\"}\\')')\n", + "Failed to batch ingest runs: LangSmithError('Failed to POST https://api.smith.langchain.com/runs/batch in LangSmith API. HTTPError(\\'403 Client Error: Forbidden for url: https://api.smith.langchain.com/runs/batch\\', \\'{\"detail\":\"Forbidden\"}\\')')\n" + ] + }, + { + "data": { + "text/plain": [ + "'There are 9,042,289 women in Malawi.'" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Failed to batch ingest runs: LangSmithError('Failed to POST https://api.smith.langchain.com/runs/batch in LangSmith API. 
HTTPError(\\'403 Client Error: Forbidden for url: https://api.smith.langchain.com/runs/batch\\', \\'{\"detail\":\"Forbidden\"}\\')')\n" + ] + } + ], + "source": [ + "user_question = \"How many women are there in Malawi\"\n", + "chain.invoke({\"question\": \"{}\".format(user_question)})" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "23e0abae-b0af-4965-9c37-00cafc30db5d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "9042289.0" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# VERIFY THIS INFORMATION USING PYTHON\n", + "df_pop.PopulationFemale.sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "e18bb38b-6313-4fd8-b2b8-b079c0e21e31", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Failed to batch ingest runs: LangSmithError('Failed to POST https://api.smith.langchain.com/runs/batch in LangSmith API. HTTPError(\\'403 Client Error: Forbidden for url: https://api.smith.langchain.com/runs/batch\\', \\'{\"detail\":\"Forbidden\"}\\')')\n", + "Failed to batch ingest runs: LangSmithError('Failed to POST https://api.smith.langchain.com/runs/batch in LangSmith API. HTTPError(\\'403 Client Error: Forbidden for url: https://api.smith.langchain.com/runs/batch\\', \\'{\"detail\":\"Forbidden\"}\\')')\n" + ] + }, + { + "data": { + "text/plain": [ + "'There are 246,415 women in Salima district.'" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Failed to batch ingest runs: LangSmithError('Failed to POST https://api.smith.langchain.com/runs/batch in LangSmith API. 
HTTPError(\\'403 Client Error: Forbidden for url: https://api.smith.langchain.com/runs/batch\\', \\'{\"detail\":\"Forbidden\"}\\')')\n" + ] + } + ], + "source": [ + "user_question = \"How many women are there in Salima district\"\n", + "chain.invoke({\"question\": \"{}\".format(user_question)})" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "d1f8a6ca-6d53-4eca-80ae-09eb07ad886f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "246415.0" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# We can check that the answer above is correct using Python code\n", + "df_pop.query('DistrictName == \"Salima\"')['PopulationFemale'].sum()" + ] + }, + { + "cell_type": "markdown", + "id": "358519f6-98e3-4d45-ba6d-96560c67dcab", + "metadata": {}, + "source": [ + "### Complicated question" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "96f94c49-1d06-4765-8af1-12b2d217b8da", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Failed to batch ingest runs: LangSmithError('Failed to POST https://api.smith.langchain.com/runs/batch in LangSmith API. HTTPError(\\'403 Client Error: Forbidden for url: https://api.smith.langchain.com/runs/batch\\', \\'{\"detail\":\"Forbidden\"}\\')')\n", + "Failed to batch ingest runs: LangSmithError('Failed to POST https://api.smith.langchain.com/runs/batch in LangSmith API. 
HTTPError(\\'403 Client Error: Forbidden for url: https://api.smith.langchain.com/runs/batch\\', \\'{\"detail\":\"Forbidden\"}\\')')\n" + ] + }, + { + "data": { + "text/plain": [ + "'Approximately 51.48% of the population in Malawi is female.'" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Failed to batch ingest runs: LangSmithError('Failed to POST https://api.smith.langchain.com/runs/batch in LangSmith API. HTTPError(\\'403 Client Error: Forbidden for url: https://api.smith.langchain.com/runs/batch\\', \\'{\"detail\":\"Forbidden\"}\\')')\n" + ] + } + ], + "source": [ + "user_question = \"What percent of the population is female in Malawi?\"\n", + "chain.invoke({\"question\": \"{}\".format(user_question)})" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "2033ecd1-0714-4bb0-a582-7dde4c366364", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "51.482681744085504" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "fem = df_pop.PopulationFemale.sum()\n", + "tot = df_pop.TotalPopulation.sum()\n", + "\n", + "fem/tot*100" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "c267ea6e-6028-4716-9731-30beabf8b3f1", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Failed to batch ingest runs: LangSmithError('Failed to POST https://api.smith.langchain.com/runs/batch in LangSmith API. HTTPError(\\'403 Client Error: Forbidden for url: https://api.smith.langchain.com/runs/batch\\', \\'{\"detail\":\"Forbidden\"}\\')')\n", + "Failed to batch ingest runs: LangSmithError('Failed to POST https://api.smith.langchain.com/runs/batch in LangSmith API. 
HTTPError(\\'403 Client Error: Forbidden for url: https://api.smith.langchain.com/runs/batch\\', \\'{\"detail\":\"Forbidden\"}\\')')\n" + ] + }, + { + "data": { + "text/plain": [ + "'Based on the SQL query and result provided, we are only retrieving the population of males in the specified region (Central, Ntchisi, TA Malenga) for the last four years. We are not directly comparing the number of men over the years to determine if they are increasing. To answer the user question accurately, we would need to retrieve the population data for men in Malawi over the last four years and compare the numbers to see if there is an increase.'" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Failed to batch ingest runs: LangSmithError('Failed to POST https://api.smith.langchain.com/runs/batch in LangSmith API. HTTPError(\\'403 Client Error: Forbidden for url: https://api.smith.langchain.com/runs/batch\\', \\'{\"detail\":\"Forbidden\"}\\')')\n" + ] + } + ], + "source": [ + "user_question = \"are the number of men increasing in the four last years in malawi\"\n", + "chain.invoke({\"question\": \"{}\".format(user_question)})" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "986fd189-6019-4a8a-bf3a-4fdc4f6e708e", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Failed to batch ingest runs: LangSmithError('Failed to POST https://api.smith.langchain.com/runs/batch in LangSmith API. HTTPError(\\'403 Client Error: Forbidden for url: https://api.smith.langchain.com/runs/batch\\', \\'{\"detail\":\"Forbidden\"}\\')')\n", + "Failed to batch ingest runs: LangSmithError('Failed to POST https://api.smith.langchain.com/runs/batch in LangSmith API. 
HTTPError(\\'403 Client Error: Forbidden for url: https://api.smith.langchain.com/runs/batch\\', \\'{\"detail\":\"Forbidden\"}\\')')\n" + ] + }, + { + "data": { + "text/plain": [ + "'The fertility rate of Malawi can be calculated by dividing the total female population by the total population. \\n\\nFor the first set of data:\\nFertility rate = total_female_population / total_population\\nFertility rate = 1303 / 2604\\nFertility rate = 0.5008\\n\\nFor the second set of data:\\nFertility rate = total_female_population / total_population\\nFertility rate = 9042289 / 17563749\\nFertility rate = 0.5143\\n\\nTherefore, the fertility rate of Malawi is approximately 0.5008 for the first set of data and 0.5143 for the second set of data.'" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Failed to batch ingest runs: LangSmithError('Failed to POST https://api.smith.langchain.com/runs/batch in LangSmith API. HTTPError(\\'403 Client Error: Forbidden for url: https://api.smith.langchain.com/runs/batch\\', \\'{\"detail\":\"Forbidden\"}\\')')\n" + ] + } + ], + "source": [ + "user_question = \"what is the fertilely rate of Malawi(Calculate)?\"\n", + "chain.invoke({\"question\": \"{}\".format(user_question)})" + ] + }, + { + "cell_type": "markdown", + "id": "8e977b26-373b-4482-b613-bd1f70276b6f", + "metadata": {}, + "source": [ + "## 4. EXERCISE: What Question Do You Want Me to Try?\n", + "Share any question in the chat you would like me to try based on this dataset so that we see how much it can handle. \n", + "\n", + "- **Share your question on the chat**\n", + "- **I will run the question here and we will inspect the response together**" + ] + }, + { + "cell_type": "markdown", + "id": "c07e2bc8-c41c-49dd-ac6b-1a77d6c6e168", + "metadata": {}, + "source": [ + "## 5. 
What We will Do During the Course\n", + "During the course we will use LangChain to build our own **Ask-A-Question (AAQ)** type \n", + "of Chatbot to enable a user to chat with a dataset by asking natural language questions. \n", + "We will build an interactive app like [this](https://llm-examples.streamlit.app) using Streamlit and be able to share it with others." + ] + }, + { + "cell_type": "markdown", + "id": "3b84bb26-3dff-4d1b-af02-8edc18ac2f36", + "metadata": {}, + "source": [ + "# Deployment\n", + "1. Web app\n", + "2. WhatsApp \n", + "3. Chatbot on website of NSO or Health ministry" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "72accb41-83da-4781-bf06-2488db5f91d1", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.8" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/notebooks/malawi-nov-24/2-document-classification-with-sklearn.ipynb b/notebooks/malawi-nov-24/2-document-classification-with-sklearn.ipynb new file mode 100644 index 0000000..20c458b --- /dev/null +++ b/notebooks/malawi-nov-24/2-document-classification-with-sklearn.ipynb @@ -0,0 +1,575 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Building a Document Classification System\n", + "The NumPy (Numerical Python) library is used for working with arrays, and the Scikit-learn library is a Python library built on NumPy, SciPy and matplotlib for data analytics and machine learning. 
The NLTK (Natural Language Toolkit) provides access to over 50 corpora and lexical resources such as WordNet, along with a suite of text processing libraries for classification, tokenization, stemming, tagging, parsing, and semantic reasoning, and wrappers for industrial-strength NLP libraries." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Ensuring that you have the necessary libraries\n", + "# !pip install nltk\n", + "# !pip install numpy\n", + "# !pip install scikit-learn" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import nltk\n", + "from nltk.corpus import reuters\n", + "from sklearn.feature_extraction.text import TfidfVectorizer\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.svm import LinearSVC\n", + "from sklearn.metrics import accuracy_score, classification_report\n", + "\n", + "from sklearn.feature_extraction.text import CountVectorizer\n", + "from sklearn.naive_bayes import MultinomialNB" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 1. Load your data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The Reuters-21578 dataset is one of the most widely used data collections for text categorization research. It is a collection of news articles; the original corpus has 10,369 documents and a vocabulary of 29,930 words, with labeled categories such as \"earnings\" and \"acquisitions\". 
You can read metadata about the dataset on [Hugging Face](https://huggingface.co/datasets/ucirvine/reuters21578)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[nltk_data] Downloading package reuters to\n", + "[nltk_data] /Users/dunstanmatekenya/nltk_data...\n", + "[nltk_data] Package reuters is already up-to-date!\n" + ] + }, + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# download the dataset\n", + "nltk.download('reuters')" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "# Load the Reuters-21578 dataset\n", + "documents = reuters.fileids()\n", + "train_docs = list(filter(lambda doc: doc.startswith(\"train\"), documents))\n", + "test_docs = list(filter(lambda doc: doc.startswith(\"test\"), documents))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2. Prepare your data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Prepare the data by extracting the raw text and category labels for both the training and testing documents. Assumption is that each document has only one category label, so we take only the first category label for each document." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "# Prepare the data\n", + "train_data = [reuters.raw(doc_id) for doc_id in train_docs]\n", + "train_labels = [reuters.categories(doc_id)[0] for doc_id in train_docs]\n", + "test_data = [reuters.raw(doc_id) for doc_id in test_docs]\n", + "test_labels = [reuters.categories(doc_id)[0] for doc_id in test_docs]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Question-How many different classes are in the training data?" 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Explore some of the training examples" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Article content: COMPUTER TERMINAL SYSTEMS <CPML> COMPLETES SALE\n", + " Computer Terminal Systems Inc said\n", + " it has completed the sale of 200,000 shares of its common\n", + " stock, and warrants to acquire an additional one mln shares, to\n", + " <Sedio N.V.> of Lugano, Switzerland for 50,000 dlrs.\n", + " The company said the warrants are exercisable for five\n", + " years at a purchase price of .125 dlrs per share.\n", + " Computer Terminal said Sedio also has the right to buy\n", + " additional shares and increase its total holdings up to 40 pct\n", + " of the Computer Terminal's outstanding common stock under\n", + " certain circumstances involving change of control at the\n", + " company.\n", + " The company said if the conditions occur the warrants would\n", + " be exercisable at a price equal to 75 pct of its common stock's\n", + " market price at the time, not to exceed 1.50 dlrs per share.\n", + " Computer Terminal also said it sold the technolgy rights to\n", + " its Dot Matrix impact technology, including any future\n", + " improvements, to <Woodco Inc> of Houston, Tex. for 200,000\n", + " dlrs. 
But, it said it would continue to be the exclusive\n", + " worldwide licensee of the technology for Woodco.\n", + " The company said the moves were part of its reorganization\n", + " plan and would help pay current operation costs and ensure\n", + " product delivery.\n", + " Computer Terminal makes computer generated labels, forms,\n", + " tags and ticket printers and terminals.\n", + " \n", + "\n", + " n\\, Label: acq\n" + ] + } + ], + "source": [ + "print(\"Article content: {} n\\, Label: {}\".format(train_data[1], train_labels[1]))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 3. Vectorizing the text data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- Vectorize the text data using the TfidfVectorizer from scikit-learn. TF-IDF is an abbreviation for Term Frequency Inverse Document Frequency. This is a very common algorithm for transforming text into a meaningful numeric representation, which is then used to fit a machine learning algorithm for prediction. \n", + "- It's worth noting that nowadays, this vectorization approach is not commonly used. We will cover **word embeddings** tomorrow, which are a better approach to representing words as numbers because **vector embeddings** can capture semantic meaning better.\n", + "\n", + "For the sklearn TF-IDF vectorizer, you can learn more about it [here](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "# Vectorize the text data\n", + "vectorizer = TfidfVectorizer(stop_words=\"english\", max_features=1000)\n", + "X_train = vectorizer.fit_transform(train_data)\n", + "X_test = vectorizer.transform(test_data)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Question: What role are the ```stop words``` playing in the code above? 
You might have learned this from Prof. Mohamad Ali already." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 4. Training a Linear Support Vector Machine (LinearSVC) classifier using the vectorized training data and corresponding label" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
LinearSVC()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + ], + "text/plain": [ + "LinearSVC()" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Train the classifier\n", + "classifier = LinearSVC()\n", + "classifier.fit(X_train, train_labels)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "classifier." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 5. Evaluate the classifier used and calculate the accuracy score as well as some other metrics (Precision, Recall and F-1 score)" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Accuracy: 0.876117919841007\n", + " precision recall f1-score support\n", + "\n", + " acq 0.95 0.96 0.96 719\n", + " alum 0.33 0.18 0.24 22\n", + " barley 1.00 0.71 0.83 14\n", + " bop 0.77 0.80 0.79 30\n", + " carcass 0.79 0.65 0.71 17\n", + " castor-oil 0.00 0.00 0.00 1\n", + " cocoa 0.94 1.00 0.97 17\n", + " coconut 0.00 0.00 0.00 2\n", + " coconut-oil 0.00 0.00 0.00 2\n", + " coffee 0.89 0.96 0.92 25\n", + " copper 0.93 0.93 0.93 15\n", + " corn 0.85 0.81 0.83 48\n", + " cotton 1.00 0.86 0.92 14\n", + " cpi 0.62 0.62 0.62 24\n", + " cpu 0.00 0.00 0.00 1\n", + " crude 0.79 0.93 0.86 182\n", + " dfl 0.00 0.00 0.00 1\n", + " dlr 0.70 0.72 0.71 43\n", + " dmk 0.00 0.00 0.00 1\n", + " earn 0.98 0.99 0.98 1083\n", + " fuel 1.00 0.22 0.36 9\n", + " gas 0.75 0.33 0.46 9\n", + " gnp 0.59 0.89 0.71 19\n", + " gold 0.96 0.96 0.96 26\n", + " grain 0.71 0.77 0.74 77\n", + " groundnut 0.00 0.00 0.00 3\n", + " heat 1.00 0.75 0.86 4\n", + " hog 1.00 0.50 0.67 4\n", + " housing 1.00 0.67 0.80 3\n", + " income 1.00 0.80 0.89 5\n", + " instal-debt 1.00 1.00 1.00 1\n", + " interest 0.78 0.76 0.77 124\n", + " ipi 1.00 1.00 1.00 11\n", + " iron-steel 0.69 0.64 0.67 14\n", + " jet 0.00 0.00 0.00 1\n", + " jobs 0.73 0.85 0.79 13\n", + " l-cattle 
0.00 0.00 0.00 2\n", + " lead 0.83 0.42 0.56 12\n", + " lei 1.00 1.00 1.00 3\n", + " livestock 0.50 0.50 0.50 6\n", + " lumber 0.00 0.00 0.00 5\n", + " meal-feed 0.20 0.17 0.18 6\n", + " money-fx 0.65 0.65 0.65 96\n", + " money-supply 0.80 0.83 0.81 29\n", + " naphtha 0.00 0.00 0.00 1\n", + " nat-gas 0.64 0.54 0.58 13\n", + " nickel 0.00 0.00 0.00 1\n", + " oilseed 0.54 0.54 0.54 13\n", + " orange 0.75 0.33 0.46 9\n", + " palladium 0.00 0.00 0.00 1\n", + " palm-oil 0.67 1.00 0.80 4\n", + " pet-chem 1.00 0.50 0.67 6\n", + " platinum 0.00 0.00 0.00 3\n", + " potato 1.00 0.67 0.80 3\n", + " propane 0.00 0.00 0.00 2\n", + " rape-oil 0.00 0.00 0.00 1\n", + " reserves 1.00 0.64 0.78 14\n", + " retail 1.00 1.00 1.00 1\n", + " rice 0.00 0.00 0.00 1\n", + " rubber 0.69 1.00 0.82 9\n", + " ship 0.39 0.41 0.40 39\n", + " silver 0.00 0.00 0.00 0\n", + " soy-oil 0.00 0.00 0.00 2\n", + " soybean 0.00 0.00 0.00 2\n", + "strategic-metal 0.00 0.00 0.00 6\n", + " sugar 0.71 0.96 0.81 25\n", + " tea 0.00 0.00 0.00 3\n", + " tin 0.71 0.50 0.59 10\n", + " trade 0.70 0.93 0.80 76\n", + " veg-oil 0.54 0.64 0.58 11\n", + " wpi 0.62 0.56 0.59 9\n", + " yen 0.00 0.00 0.00 6\n", + " zinc 0.00 0.00 0.00 5\n", + "\n", + " accuracy 0.88 3019\n", + " macro avg 0.53 0.48 0.49 3019\n", + " weighted avg 0.86 0.88 0.87 3019\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/dunstanmatekenya/anaconda3/lib/python3.10/site-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n", + " _warn_prf(average, modifier, msg_start, len(result))\n", + "/Users/dunstanmatekenya/anaconda3/lib/python3.10/site-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. 
Use `zero_division` parameter to control this behavior.\n", + " _warn_prf(average, modifier, msg_start, len(result))\n", + "/Users/dunstanmatekenya/anaconda3/lib/python3.10/site-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n", + " _warn_prf(average, modifier, msg_start, len(result))\n", + "/Users/dunstanmatekenya/anaconda3/lib/python3.10/site-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n", + " _warn_prf(average, modifier, msg_start, len(result))\n", + "/Users/dunstanmatekenya/anaconda3/lib/python3.10/site-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n", + " _warn_prf(average, modifier, msg_start, len(result))\n", + "/Users/dunstanmatekenya/anaconda3/lib/python3.10/site-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n", + " _warn_prf(average, modifier, msg_start, len(result))\n" + ] + } + ], + "source": [ + "# Evaluate the classifier\n", + "y_pred = classifier.predict(X_test)\n", + "accuracy = accuracy_score(test_labels, y_pred)\n", + "print(\"Accuracy:\", accuracy)\n", + "print(classification_report(test_labels, y_pred))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 6. 
Classify new documents (new BBC headlines) by vectorizing them using the same TfidfVectorizer and predicting their labels using the trained classifier" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Predicted labels: ['ship' 'ship' 'acq']\n" + ] + } + ], + "source": [ + "# Classify new documents (recent headlines obtained from BBC news regarding Tunisia)\n", + "new_docs = [\n", + " \"Tunisia says 23 people missing in Mediterranean sea.\",\n", + " \"Tunisia officials arrested in dispute over flag display.\",\n", + " \"Tunisia lawyer arrested during live news broadcast.\"\n", + "]\n", + "new_docs_vectors = vectorizer.transform(new_docs)\n", + "predicted_labels = classifier.predict(new_docs_vectors)\n", + "print(\"Predicted labels:\", predicted_labels)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Discussion" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "How did this classifier fare? What can you do to improve the model?
\n", + "Ans: Experimenting with different preprocessing techniques, feature extraction models and classification algorithms." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Trying with a different classifier" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Steps 1 - 3 will be the same." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Load the Reuters-21578 dataset\n", + "documents = reuters.fileids()\n", + "train_docs = list(filter(lambda doc: doc.startswith(\"train\"), documents))\n", + "test_docs = list(filter(lambda doc: doc.startswith(\"test\"), documents))\n", + "\n", + "# Prepare the data\n", + "train_data = [reuters.raw(doc_id) for doc_id in train_docs]\n", + "train_labels = [reuters.categories(doc_id)[0] for doc_id in train_docs]\n", + "test_data = [reuters.raw(doc_id) for doc_id in test_docs]\n", + "test_labels = [reuters.categories(doc_id)[0] for doc_id in test_docs]\n", + "\n", + "# Vectorize the text data\n", + "vectorizer = CountVectorizer(stop_words=\"english\", max_features=1000)\n", + "X_train = vectorizer.fit_transform(train_data)\n", + "X_test = vectorizer.transform(test_data)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Different Classifier (Multinomial Naive Bayes)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "classifier = MultinomialNB()\n", + "classifier.fit(X_train, train_labels)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Evaluate the classifier\n", + "y_pred = classifier.predict(X_test)\n", + "accuracy = accuracy_score(test_labels, y_pred)\n", + "print(\"Accuracy:\", accuracy)\n", + "print(classification_report(test_labels, y_pred))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + 
"source": [ + "# Classify new documents (recent headlines obtained from BBC news regarding Tunisia)\n", + "new_docs = [\n", + " \"Tunisia says 23 people missing in Mediterranean sea.\",\n", + " \"Tunisia officials arrested in dispute over flag display.\",\n", + " \"Tunisia lawyer arrested during live news broadcast.\"\n", + "]\n", + "new_docs_vectors = vectorizer.transform(new_docs)\n", + "predicted_labels = classifier.predict(new_docs_vectors)\n", + "print(\"Predicted labels:\", predicted_labels)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Discussion: Compare the results" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The choice of classifier depends on the specific characteristics of your dataset and the problem at hand. Multinomial Naive Bayes is known to work well with text data and can handle high-dimensional feature spaces efficiently. However, it assumes that the features are independent of each other, which may not always be the case in real-world scenarios.\n", + "\n", + "You can also experiment with different classifiers, such as Logistic Regression, Random Forest, or Gradient Boosting, and compare their performance to find the best fit for your dataset. You can also refine the model by trying different feature extraction techniques and hyperparameters." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### There are also other ways you can approach this, for example, Document Classification using BERT. Here is a notebook example on Kaggle that you can explore: https://www.kaggle.com/code/merishnasuwal/document-classification-using-bert" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "BERT (Bidirectional Encoder Representations from Transformers) and other Transformer encoder architectures can also be used on a variety of tasks in NLP (natural language processing). 
They compute vector-space representations of natural language that are suitable for use in deep learning models. The BERT family of models uses the Transformer encoder architecture to process each token of input text in the full context of all tokens before and after. BERT models are usually pre-trained on a large corpus of text, then fine-tuned for specific tasks." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.8" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/notebooks/malawi-nov-24/3-intro-langchain.ipynb b/notebooks/malawi-nov-24/3-intro-langchain.ipynb new file mode 100644 index 0000000..82950fc --- /dev/null +++ b/notebooks/malawi-nov-24/3-intro-langchain.ipynb @@ -0,0 +1,1898 @@ +{ + "cells": [ + { + "attachments": { + "7153af0c-fb8b-4b47-826e-57ac60696e0c.png": { + "image/png": "" + }, + "faf11697-6be8-49bc-ab24-b3c4385b8a67.png": { + "image/png": "" + } + }, + "cell_type": "markdown", + "id": "740ffa74-4eda-4843-9b5b-486caab1153b", + "metadata": { + "tags": [] + }, + "source": [ + "# Introduction to LangChain\n", + "---------\n", + "![image.png](attachment:faf11697-6be8-49bc-ab24-b3c4385b8a67.png)![image.png](attachment:7153af0c-fb8b-4b47-826e-57ac60696e0c.png)\n", + "\n", + "**DIHPA'24**\n", + "\n", + "**Author:** Dunstan Matekenya \n", + "\n", + "**Affiliation:** DECAT, The World Bank Group \n", + "\n", + "**Date:** May 30, 2024\n", + "\n", + "\n", + "## What you will learn \n", + "In this notebook, you will learn the basics of the LangChain platform as follows.\n", + "1. 
**LLM capabilities.** Explore LLM capabilities using LangChain\n", + "2. **Interacting with LLMs.** Use LangChain functions such as chains, prompt templates and more to connect to LLMs\n", + "3. **RAG.** Implementing a simple RAG in LangChain by connecting to external documents\n", + "4. **LangChain Expression Language (LCEL).** How to use LCEL instead of functions when interacting with LLMs\n", + "5. **LangChain Agents.**\n", + "\n", + "## Expected Broad Learning Outcomes\n", + "1. **Connecting to LLMs.** An understanding of how to connect to various open source and proprietary LLMs using Hugging Face and provider-specific frameworks such as those for OpenAI and Mistral\n", + "2. **Different LLMs.** There are many varieties of LLMs: ```chat, instruct, question-answer, sentiment-analysis``` and more. Have a basic understanding of the differences across these models and when to use which one.\n", + "3. **The role of memory in Chat models.** Understand the importance of having memory in a chatbot and different strategies for doing it with LangChain.\n", + "4. **The process of implementing RAG in LangChain.** RAG is one of the most commonly used approaches for implementing chats as it enables connection to external custom data. Have a good understanding of the main steps involved in implementing a RAG-based system; the steps are the same in LangChain and other frameworks.\n", + "5. **Understand the role of vector databases.** Vector databases are an integral part of working with LLMs. Make sure you understand how they fit in the ecosystem and why they are important." + ] + }, + { + "cell_type": "markdown", + "id": "633b7017-2001-4cec-b34d-30a5bc4b92fc", + "metadata": {}, + "source": [ + "# Setup\n", + "\n", + "------" + ] + }, + { + "cell_type": "markdown", + "id": "7a131a8d-40d2-4bf3-9856-103ed70000d7", + "metadata": {}, + "source": [ + "## Import Packages\n", + "We will import packages as we go so that you appreciate which class we are using."
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "b9a13ee9-f3d9-4141-a1a2-929cdc1b5113", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "from pathlib import Path" + ] + }, + { + "cell_type": "markdown", + "id": "1e1dad1b-4014-48e9-b911-2095c9864a84", + "metadata": {}, + "source": [ + "## Setup API Keys" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "b443a7a8-9dd7-4320-958e-cc3376e09cb4", + "metadata": {}, + "outputs": [], + "source": [ + "# ====================\n", + "# Setup API Keys\n", + "# ====================\n", + "# Although it's not recommended for security, you can also just \n", + "# paste your API keys \n", + "#OPENAI_API_KEY\n", + "#HUGGINGFACEHUB_API_TOKEN" + ] + }, + { + "cell_type": "markdown", + "id": "61224418-685b-499c-96fe-568ba7993475", + "metadata": {}, + "source": [ + "## Setup input directories \n", + "Let's organize where our data is stored so that we can easily access it. Please refer to the slides for the recommended folder setup. Copy and paste the full paths to your working folder in the variables below." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "a234a85f-b21b-4378-b3bd-f27feec67c36", + "metadata": {}, + "outputs": [], + "source": [ + "# Replace this folder with your working folder \n", + "DIR_WD = Path(\"/Users/dunstanmatekenya/Google Drive/My Drive/GenAI-Course/Mod2-LLM-Overview/\")\n", + "\n", + "# data folder\n", + "DIR_DATA = DIR_WD.joinpath(\"data\")\n", + "\n", + "# We can also set file names for data files we will use to save time\n", + "FILE_HEP_CHAD = DIR_DATA.joinpath(\"Hepatitis-Chad.pdf\")\n", + "\n", + "FILE_MIDDLE_EAST_COVID = DIR_DATA.joinpath(\"MidEast-COVID.pdf\")\n", + "\n", + "FILE_DENGUE = DIR_DATA.joinpath(\"Dengue-Global-situation.pdf\")" + ] + }, + { + "cell_type": "markdown", + "id": "2f148739-53c8-4cb8-b477-da5781e8195d", + "metadata": {}, + "source": [ + "# 1.
Exploring Language Tasks that LLMs can Perform\n", + "In this section, we will explore what type of NLP tasks LLMs can perform using the Hugging Face transformers package. In some cases, when we specify a specific model, the transformers package will take some time to download the model files. Also, the idea here is to show very simple capabilities. In a real world project, you can train and fine-tune the transformer models on your own dataset. For example, to do a fully fledged sentiment analysis with Hugging Face, take a look at [this tutorial](https://huggingface.co/blog/sentiment-analysis-python).\n", + "\n", + ">Note that for almost all of these tasks, you can replace the English text with French text and still get similar results" + ] + }, + { + "cell_type": "markdown", + "id": "2dc053ef-5ba8-4557-b0f2-0975447d566e", + "metadata": {}, + "source": [ + "## 1.1 Text and Document Classification\n", + "Text and document classification are closely related tasks. In **text classification**, we assign predefined categories to individual pieces of text, while **document classification** refers to the process of assigning predefined categories to longer pieces of text, such as entire documents, articles, or reports.\n", + "\n", + "- **Examples of text classification tasks**. Sentiment Analysis; Intent Detection.\n", + "- **Examples of document classification**. Topic categorization." + ] + }, + { + "cell_type": "markdown", + "id": "860fcb52-c3f4-4c8f-8fb4-e2b4808114c2", + "metadata": {}, + "source": [ + "### Sentiment Analysis with the Hugging Face Transformers Library" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "499b7d29-d647-4cb9-ae00-fd2b88ae18b5", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/dunstanmatekenya/anaconda3/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets.
See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n", + "No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).\n", + "Using a pipeline without specifying a model name and revision in production is not recommended.\n", + "/Users/dunstanmatekenya/anaconda3/lib/python3.10/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n", + " warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "POSITIVE\n" + ] + } + ], + "source": [ + "# We use transformers ```pipeline library\n", + "from transformers import pipeline\n", + "\n", + "llm = pipeline(\"text-classification\")\n", + "text = \"I'm really enjoying my stay in Tunis\"\n", + "outputs = llm(text)\n", + "print(outputs[0]['label'])" + ] + }, + { + "cell_type": "markdown", + "id": "affcc68c-49ea-4cfe-bc59-294854156360", + "metadata": {}, + "source": [ + "## 1.2 Text Generation\n", + "Text generation is a process in natural language processing (NLP) where a machine learning model generates coherent and contextually relevant text based on a given input or prompt. This technology is used in various applications such as chatbots, automated content creation, machine translation, and more.\n", + "\n", + "In real life, the text is not always coherent, based on the model, when we use a default model, the results are not good. 
" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "0224c530-1943-481e-b4fc-92cfb2f62702", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "No model was supplied, defaulted to openai-community/gpt2 and revision 6c0e608 (https://huggingface.co/openai-community/gpt2).\n", + "Using a pipeline without specifying a model name and revision in production is not recommended.\n", + "Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n", + "Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Malawi is famous for urchin, the rice used in the national diet, and its high price and widespread lack of access to good sources of water are driving thousands of people into poverty. 
So far, the government has been able to bring in more than $2 billion through loans, while food aid has been limited in its expansion, for example by one million poor people trying to come back from war-ravaged country.\n", + "\n", + "As for the country's food safety, the government has been\n" + ] + } + ], + "source": [ + "llm = pipeline(\"text-generation\")\n", + "prompt = \"Malawi is famous for \"\n", + "outputs = llm(prompt, max_length=100)\n", + "print(outputs[0]['generated_text'])" + ] + }, + { + "cell_type": "markdown", + "id": "aabac32c-0547-44d4-b77b-34d16a2d8220", + "metadata": {}, + "source": [ + "**EXERCISE-0: Try to specify a different Hugging Face model and see if you get better results**" + ] + }, + { + "cell_type": "markdown", + "id": "b6bf9c35-35f0-4310-adff-6dbb7eab57f4", + "metadata": {}, + "source": [ + "## 1.3 Text Summarization\n", + "Text summarization is a natural language processing (NLP) task that involves creating a concise and coherent summary of a longer text document. The goal is to capture the most important information and main ideas while reducing the length of the original text. " + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "d0f4c46a-b896-47ce-928b-0c8b8bae063d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Walking amid Gion's Machiya wooden houses is a mesmerizing experience. The beautifullypreserved structures exuded an old-world charm that transports visitors back in time. The glow of lanterns lining the narrow streets add to theenchanting ambiance, making each stroll a\n" + ] + } + ], + "source": [ + "llm= pipeline(\"summarization\", model=\"facebook/bart-large-cnn\")\n", + "long_text = \"\"\"Walking amid Gion's Machiya wooden houses is a mesmerizing experience. The beautifully\n", + "preserved structures exuded an old-world charm that transports visitors back in time, making them feel\n", + "like they had stepped into a living museum. 
The glow of lanterns lining the narrow streets add to the\n", + "enchanting ambiance, making each stroll a memorable journey through Japan's rich cultural history.\n", + "\"\"\"\n", + "outputs = llm(long_text, max_length=60, clean_up_tokenization_spaces=True)\n", + "print(outputs[0]['summary_text'])" + ] + }, + { + "cell_type": "markdown", + "id": "58c10bc4-2237-45b2-a367-c7690a2586b4", + "metadata": {}, + "source": [ + "## 1.4 Question-Answering\n", + "Question Answering (QA) is one of the most common tasks or use cases for LLMs. In this task, the model is designed to automatically answer questions posed by humans in natural language. QA systems can be built to answer questions from a variety of sources, such as structured databases, knowledge bases, or unstructured text documents." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "a8aefbf4-a88a-4cdc-b882-eb0a5781200b", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "No model was supplied, defaulted to distilbert/distilbert-base-cased-distilled-squad and revision 626af31 (https://huggingface.co/distilbert/distilbert-base-cased-distilled-squad).\n", + "Using a pipeline without specifying a model name and revision in production is not recommended.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "wooden\n" + ] + } + ], + "source": [ + "llm = pipeline(\"question-answering\")\n", + "context = \"Walking amid Gion's Machiya wooden houses was a mesmerizing experience.\"\n", + "question = \"What are Machiya houses made of?\"\n", + "outputs = llm(question=question, context=context)\n", + "print(outputs['answer'])" + ] + }, + { + "cell_type": "markdown", + "id": "eea8c421-6a5d-426c-8f9f-fc94bc61b492", + "metadata": {}, + "source": [ + "## 1.5 Language Translation" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "cde7750a-4ab4-4a5d-9019-7dc7be065719", + "metadata": {}, + "outputs": [ + { + "name": "stderr", +
"output_type": "stream", + "text": [ + "No model was supplied, defaulted to google-t5/t5-base and revision 686f1db (https://huggingface.co/google-t5/t5-base).\n", + "Using a pipeline without specifying a model name and revision in production is not recommended.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "C'est ma première visite en Tunisie.\n" + ] + } + ], + "source": [ + "llm = pipeline(\"translation_en_to_fr\")\n", + "text = \"This is my first time to visit Tunisia.\"\n", + "outputs = llm(text, clean_up_tokenization_spaces=True)\n", + "print(outputs[0]['translation_text'])" + ] + }, + { + "cell_type": "markdown", + "id": "d3e3d82d-f4ea-439b-9fd5-08ae2105f3a3", + "metadata": {}, + "source": [ + "# 2. Introducing LangChain Core Functionalities" + ] + }, + { + "cell_type": "markdown", + "id": "0d4c7bd8-c8cb-4e99-bef5-40144a82c78c", + "metadata": {}, + "source": [ + "It is always a good idea to read the documentation of a framework. Please head over to the [LangChain website](https://www.langchain.com) for details of core functionalities, use cases and features. The screenshot below provides a summary of the LangChain ecosystem of features and capabilities. The term **Chain** in LangChain refers to the core concept of **chains**: sequences of calls - whether to an LLM, a tool, or a data preprocessing step. The primary supported way to do this is with LCEL (we will see this later)."
+ ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "b02bc8a4-ea90-4622-92d8-43861fcb12d2", + "metadata": {}, + "outputs": [], + "source": [ + "from IPython.display import Image" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "2e39d82c-7089-4aa4-8054-d01427f1c1b3", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "" + ] + }, + "execution_count": 11, + "metadata": { + "image/png": { + "width": 500 + } + }, + "output_type": "execute_result" + } + ], + "source": [ + "Image(filename='../images/LangChain-detailed.png', width=500) " + ] + }, + { + "cell_type": "markdown", + "id": "40dcce20-057f-4b5b-84b6-356a3c1db8a7", + "metadata": {}, + "source": [ + "## 2.1 Interacting with Models in LangChain \n", + "- General instruction models - Models which can answer questions but are not quite optimized for chat\n", + "- Chat models are more optimized for question answering\n", + "- Prompting templates and techniques " + ] + }, + { + "cell_type": "markdown", + "id": "33a7bf7e-0ea4-431f-9f3e-23119e1a14a7", + "metadata": {}, + "source": [ + "### Trying out Open vs. Proprietary Models\n", + "- **Accessing open source LLMs on Hugging Face.** In order to access open source LLMs from Hugging Face, you need two main inputs: ```Hugging Face token``` and the model id or url. Recall that you can explore and grab model details from the Hugging Face platform easily. Once you have these, you can use ```HuggingFaceEndpoint``` or ```HuggingFaceHub``` to access and use the model.\n", + "\n", + "- **Accessing proprietary LLMs (e.g., OpenAI).** LangChain has specific packages for working with OpenAI models. For other providers such as Mistral, you need to check the [LangChain documentation](https://python.langchain.com/v0.1/docs/integrations/chat/mistralai/)."
+ ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "0f911200-d6b2-47a2-b862-643f7f61bd83", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Failed to batch ingest runs: LangSmithError('Failed to POST https://api.smith.langchain.com/runs/batch in LangSmith API. HTTPError(\\'403 Client Error: Forbidden for url: https://api.smith.langchain.com/runs/batch\\', \\'{\"detail\":\"Forbidden\"}\\')')\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Can you still have fun in the rain?\n", + "Yes, you can still have fun in the rain! There are plenty of activities you can do indoors or outdoors, such as playing board games, reading a book, or going for a walk. You can also try to find creative ways to enjoy the rain, such as using a rain shower to take a bath or making a rain-soaked picnic. Just remember to stay safe and take precautions if necessary.\n" + ] + } + ], + "source": [ + "from langchain_community.llms import HuggingFaceEndpoint, HuggingFaceHub\n", + "\n", + "# Lets make this a global variable in case we want to use this model\n", + "# again\n", + "MODEL_ID_FALCON = 'tiiuae/falcon-7b-instruct'\n", + "\n", + "llm = HuggingFaceHub(repo_id=MODEL_ID_FALCON, \n", + " huggingfacehub_api_token=HUGGINGFACEHUB_API_TOKEN)\n", + "\n", + "question = 'Can you still have fun'\n", + "output = llm.invoke(question)\n", + "print(output)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "b35f1964-5938-4773-b768-334df1551939", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Failed to batch ingest runs: LangSmithError('Failed to POST https://api.smith.langchain.com/runs/batch in LangSmith API. 
HTTPError(\\'403 Client Error: Forbidden for url: https://api.smith.langchain.com/runs/batch\\', \\'{\"detail\":\"Forbidden\"}\\')')\n", + "Failed to batch ingest runs: LangSmithError('Failed to POST https://api.smith.langchain.com/runs/batch in LangSmith API. HTTPError(\\'403 Client Error: Forbidden for url: https://api.smith.langchain.com/runs/batch\\', \\'{\"detail\":\"Forbidden\"}\\')')\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " if you're dead inside?\n", + "\n", + "It is possible to have fun even if you feel dead inside. While feeling emotionally numb or disconnected can make it more challenging to enjoy activities or events, it is still possible to find moments of joy and pleasure.\n", + "\n", + "Here are some tips for having fun even if you feel dead inside:\n", + "\n", + "1. Engage in activities that have brought you joy in the past. Think back to activities or hobbies that you used to enjoy before you started feeling dead inside. Even if you don't feel the same level of excitement, engaging in these activities can still bring some enjoyment.\n", + "\n", + "2. Try something new. Sometimes, trying something new can help break out of a rut and bring some fun into your life. This could be a new hobby, sport, or even a new type of food.\n", + "\n", + "3. Spend time with loved ones. Being around people who care about you and make you feel loved and supported can help lift your mood and bring some fun into your life. Plan a fun outing or simply spend time talking and laughing with friends and family.\n", + "\n", + "4. Practice self-care. Taking care of yourself can help improve your overall mood and make it easier to have fun. Make time for activities that help you relax and recharge, such as taking a bath, reading a book,\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Failed to batch ingest runs: LangSmithError('Failed to POST https://api.smith.langchain.com/runs/batch in LangSmith API. 
HTTPError(\\'403 Client Error: Forbidden for url: https://api.smith.langchain.com/runs/batch\\', \\'{\"detail\":\"Forbidden\"}\\')')\n" + ] + } + ], + "source": [ + "from langchain_openai import OpenAI\n", + "\n", + "# Note that we will be able to select specific OpenAI models \n", + "# If you have a paid account \n", + "llm = OpenAI(openai_api_key=OPENAI_API_KEY)\n", + "\n", + "question = 'Can you still have fun'\n", + "output = llm.invoke(question)\n", + "print(output)" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "e9bda8e1-dce1-49c1-b308-82063fa53e6a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "3544" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "2*1772" + ] + }, + { + "cell_type": "markdown", + "id": "7381c4a5-5a33-405c-b82a-baacfebe6e56", + "metadata": {}, + "source": [ + "**EXERCISE-1. Find another model on Hugging Face to try**\n", + "- Go to [Hugging Face](https://huggingface.co/models)\n", + "- Search for **Text Generation** LLMs. Note that large models can be hard and take long to run.\n", + "- Get the model Id\n", + "- Initialize the model, and ask it a question/prompt as we did with Falcon model above" + ] + }, + { + "cell_type": "markdown", + "id": "f4edd5e5-44fa-49b0-b150-64a580da8f66", + "metadata": {}, + "source": [ + "### . Prompt templates\n", + "Prompt templates are used for creating prompts in a more modular way, so they can be reused and built on. 
Chains act as the glue in LangChain; bringing the other components together into workflows that pass inputs and outputs between the different components\n", + "- They are recipes for generating prompts\n", + "- Flexible and modular\n", + "- Can contain: instructions, examples, and additional context" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "0a2faa69-863e-4a4e-9118-ba50e0e72586", + "metadata": {}, + "outputs": [ + { + "ename": "ValidationError", + "evalue": "1 validation error for HuggingFaceHub\ntoken\n extra fields not permitted (type=value_error.extra)", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mValidationError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[21], line 8\u001b[0m\n\u001b[1;32m 5\u001b[0m template \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mYou are an artificial intelligence assistant, answer the question. 
\u001b[39m\u001b[38;5;132;01m{question}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 6\u001b[0m prompt \u001b[38;5;241m=\u001b[39m PromptTemplate(template\u001b[38;5;241m=\u001b[39mtemplate, input_variables\u001b[38;5;241m=\u001b[39m[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mquestion\u001b[39m\u001b[38;5;124m\"\u001b[39m])\n\u001b[0;32m----> 8\u001b[0m llm \u001b[38;5;241m=\u001b[39m \u001b[43mHuggingFaceHub\u001b[49m\u001b[43m(\u001b[49m\u001b[43mrepo_id\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mMODEL_ID_FALCON\u001b[49m\u001b[43m,\u001b[49m\u001b[43mtoken\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mHUGGINGFACEHUB_API_TOKEN\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 10\u001b[0m \u001b[38;5;66;03m# Create a Chain using the LLMChain() \u001b[39;00m\n\u001b[1;32m 11\u001b[0m llm_chain \u001b[38;5;241m=\u001b[39m LLMChain(prompt\u001b[38;5;241m=\u001b[39mprompt, llm\u001b[38;5;241m=\u001b[39mllm)\n", + "File \u001b[0;32m~/anaconda3/lib/python3.10/site-packages/pydantic/main.py:341\u001b[0m, in \u001b[0;36mpydantic.main.BaseModel.__init__\u001b[0;34m()\u001b[0m\n", + "\u001b[0;31mValidationError\u001b[0m: 1 validation error for HuggingFaceHub\ntoken\n extra fields not permitted (type=value_error.extra)" + ] + } + ], + "source": [ + "from langchain.prompts import PromptTemplate, ChatPromptTemplate\n", + "\n", + "# A String with instructions, same way we create prompts\n", + "# in GUI based interface such as chatGPT\n", + "template = \"You are an artificial intelligence assistant, answer the question. 
{question}\"\n", + "prompt = PromptTemplate(template=template, input_variables=[\"question\"])\n", + "\n", + "llm = HuggingFaceHub(repo_id=MODEL_ID_FALCON,token=HUGGINGFACEHUB_API_TOKEN)\n", + "\n", + "# Create a Chain using the LLMChain() \n", + "llm_chain = LLMChain(prompt=prompt, llm=llm)\n", + "question = \"What is LangChain?\"\n", + " \n", + "print(llm_chain.run(question))" + ] + }, + { + "cell_type": "markdown", + "id": "5a614f93-417e-4185-96c6-dc0b2ae1704d", + "metadata": {}, + "source": [ + "### Chat Models\n", + "Chat Models are a core component of LangChain. A chat model is a language model that uses chat messages as inputs and returns chat messages as outputs (as opposed to using plain text)" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "684669c2-a2f7-4801-b9db-87b702b545e5", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/dunstanmatekenya/anaconda3/lib/python3.10/site-packages/langchain_core/_api/deprecation.py:119: LangChainDeprecationWarning: The method `BaseChatModel.__call__` was deprecated in langchain-core 0.1.7 and will be removed in 0.3.0. Use invoke instead.\n", + " warn_deprecated(\n", + "Failed to batch ingest runs: LangSmithError('Failed to POST https://api.smith.langchain.com/runs/batch in LangSmith API. HTTPError(\\'403 Client Error: Forbidden for url: https://api.smith.langchain.com/runs/batch\\', \\'{\"detail\":\"Forbidden\"}\\')')\n" + ] + }, + { + "data": { + "text/plain": [ + "AIMessage(content='One of the best places to visit in Malawi is Lake Malawi. This stunning lake is known for its crystal-clear waters, beautiful beaches, and diverse marine life. Visitors can enjoy a variety of water activities such as snorkeling, diving, kayaking, and sailing. The lake is also surrounded by national parks and reserves, offering opportunities for wildlife viewing and hiking. 
Additionally, the lakeshore is dotted with charming villages where you can experience the local culture and hospitality. Overall, Lake Malawi is a must-visit destination for nature lovers and adventure seekers in Malawi.', response_metadata={'token_usage': {'completion_tokens': 116, 'prompt_tokens': 38, 'total_tokens': 154}, 'model_name': 'gpt-3.5-turbo', 'system_fingerprint': None, 'finish_reason': 'stop', 'logprobs': None}, id='run-11f38ec0-7a4b-47d9-9080-855d38cf0f35-0', usage_metadata={'input_tokens': 38, 'output_tokens': 116, 'total_tokens': 154})" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Failed to batch ingest runs: LangSmithError('Failed to POST https://api.smith.langchain.com/runs/batch in LangSmith API. HTTPError(\\'403 Client Error: Forbidden for url: https://api.smith.langchain.com/runs/batch\\', \\'{\"detail\":\"Forbidden\"}\\')')\n" + ] + } + ], + "source": [ + "from langchain_openai import ChatOpenAI\n", + "from langchain.prompts import PromptTemplate, ChatPromptTemplate\n", + "\n", + "llm = ChatOpenAI(temperature=0, openai_api_key=OPENAI_API_KEY)\n", + "\n", + "prompt_template = ChatPromptTemplate.from_messages([\n", + "(\"system\", \"You are a helpful assistant who knows alot about Africa.\"),\n", + "(\"human\",\"Respond to the question: {question}\")]\n", + ")\n", + "\n", + "full_prompt = prompt_template.format_messages(question='What is the best place to visit in Malawi?')\n", + "llm(full_prompt)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e11ea305-a3c9-439f-b135-236e26c39ac1", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "c3e8c433-8672-4ae8-9579-66d5201bc657", + "metadata": {}, + "source": [ + "## 2.2. 
Managing chat model memory\n", + "- A key feature of chatbot applications is the ability to have a conversation, where context from the conversation is stored and available for the model to access for later questions or reference.\n", + "- Memory is important for conversations with chat models; it opens up the possibility of providing follow-up questions, of building and iterating on model responses, and for chatbots to adapt to the user's preferences and behaviors. \n", + "- Although LangChain allows us to customize and optimize in-conversation chatbot memory, it is still limited by the model's context window. \n", + "- An **LLM's context window** is the amount of input text the model can consider at once when generating a response, and the length of this window varies for different models. LangChain has a standard syntax for optimizing model memory. \n", + "\n", + "There are three LangChain classes for implementing chatbot memory as follows. \n", + "### The ```ChatMessageHistory``` Class\n", + "- The ChatMessageHistory class stores the full history of messages between the user and model. By providing this to the model, we can provide follow-up questions and iterate on the response message.\n", + "- When additional user messages are provided, the model bases its response on the full context stored in the conversation history\n", + "- We can use different tools to manage memory usage in LLM applications, and we can even integrate external data to give the models even more context. \n", + "\n", + "\n", + "### The ```ConversationBufferMemory``` class\n", + "- This gives the application a rolling buffer memory containing the last few messages in the conversation. Users can specify the number of messages to store with the size argument, and the application will discard older messages as newer ones are added. \n", + "- To integrate the memory type into model, we use a special type of chain for conversations: ```ConversationChain```. 
\n", + "\n", + "### The ```ConversationSummaryMemory``` class\n", + "- Summarizing important points from a conversation can also be a good way of optimizing memory. The ConversationSummaryMemory class summarizes the conversation over time, condensing the information. \n", + "- This means that the chat model can remember key pieces of context without needing to store and process the entire conversation history" + ] + }, + { + "cell_type": "markdown", + "id": "e6b7f3d4-afdd-422c-8688-7a31cb79bb26", + "metadata": {}, + "source": [ + "### Trying out the ChatMessageHistory class" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a98c8f82-8e9a-4603-811b-c20d034ee6b4", + "metadata": {}, + "outputs": [], + "source": [ + "chat = ChatOpenAI(temperature=0, openai_api_key=OPENAI_API_KEY)\n", + "history = ChatMessageHistory()\n", + "history.add_ai_message(\"Hi! Ask me anything please.\")\n", + "history.add_user_message(\"Describe a metaphor for learning LangChain in one sentence.\")\n", + "chat(history.messages)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1248fd89-60d3-4d49-a276-f419417f8e88", + "metadata": {}, + "outputs": [], + "source": [ + "# Ask a question based on the previous messages \n", + "history.add_user_message(\"Summarize the preceding sentence in fewer words\")\n", + "chat(history.messages)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5eeeaed5-cce7-4261-a852-eee1941c808b", + "metadata": {}, + "outputs": [], + "source": [ + "# Ask a question based on the previous messages \n", + "history.add_user_message(\"Summarize the preceding sentence in fewer words\")\n", + "chat(history.messages)" + ] + }, + { + "cell_type": "markdown", + "id": "6e1b5fb0-9fbf-4836-94d0-4017efbdfae0", + "metadata": {}, + "source": [ + "### Trying out the ConversationBufferMemory\n", + "For many applications, storing and accessing the entire conversation history isn't technically feasible. 
In these cases, the messages must be condensed while retaining as much relevant context as possible. One common way of doing this is with a memory buffer, which stores only the most recent messages based on the parameter ```size```." + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "c8742c04-33db-42fb-8509-987dee9e61d0", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\n", + "\u001b[1m> Entering new ConversationChain chain...\u001b[0m\n", + "Prompt after formatting:\n", + "\u001b[32;1m\u001b[1;3mThe following is a friendly conversation between a human and an AI. The AI is talkative and provides lots of specific details from its context. If the AI does not know the answer to a question, it truthfully says it does not know.\n", + "\n", + "Current conversation:\n", + "\n", + "Human: Describe a language model in one sentence\n", + "AI:\u001b[0m\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Failed to batch ingest runs: LangSmithError('Failed to POST https://api.smith.langchain.com/runs/batch in LangSmith API. HTTPError(\\'403 Client Error: Forbidden for url: https://api.smith.langchain.com/runs/batch\\', \\'{\"detail\":\"Forbidden\"}\\')')\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\u001b[1m> Finished chain.\u001b[0m\n", + "\n", + "\n", + "\u001b[1m> Entering new ConversationChain chain...\u001b[0m\n", + "Prompt after formatting:\n", + "\u001b[32;1m\u001b[1;3mThe following is a friendly conversation between a human and an AI. The AI is talkative and provides lots of specific details from its context. 
If the AI does not know the answer to a question, it truthfully says it does not know.\n", + "\n", + "Current conversation:\n", + "Human: Describe a language model in one sentence\n", + "AI: A language model is a statistical model that is trained on a large corpus of text and is able to generate coherent and grammatically correct sentences based on the patterns and structures it has learned.\n", + "Human: Describe it again using less words\n", + "AI:\u001b[0m\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Failed to batch ingest runs: LangSmithError('Failed to POST https://api.smith.langchain.com/runs/batch in LangSmith API. HTTPError(\\'403 Client Error: Forbidden for url: https://api.smith.langchain.com/runs/batch\\', \\'{\"detail\":\"Forbidden\"}\\')')\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\u001b[1m> Finished chain.\u001b[0m\n", + "\n", + "\n", + "\u001b[1m> Entering new ConversationChain chain...\u001b[0m\n", + "Prompt after formatting:\n", + "\u001b[32;1m\u001b[1;3mThe following is a friendly conversation between a human and an AI. The AI is talkative and provides lots of specific details from its context. 
If the AI does not know the answer to a question, it truthfully says it does not know.\n", + "\n", + "Current conversation:\n", + "Human: Describe a language model in one sentence\n", + "AI: A language model is a statistical model that is trained on a large corpus of text and is able to generate coherent and grammatically correct sentences based on the patterns and structures it has learned.\n", + "Human: Describe it again using less words\n", + "AI: A language model is a computer program that can generate sentences based on patterns it has learned from a large amount of text.\n", + "Human: Describe it again fewer words but at least one word\n", + "AI:\u001b[0m\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Failed to batch ingest runs: LangSmithError('Failed to POST https://api.smith.langchain.com/runs/batch in LangSmith API. HTTPError(\\'403 Client Error: Forbidden for url: https://api.smith.langchain.com/runs/batch\\', \\'{\"detail\":\"Forbidden\"}\\')')\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\u001b[1m> Finished chain.\u001b[0m\n", + "\n", + "\n", + "\u001b[1m> Entering new ConversationChain chain...\u001b[0m\n", + "Prompt after formatting:\n", + "\u001b[32;1m\u001b[1;3mThe following is a friendly conversation between a human and an AI. The AI is talkative and provides lots of specific details from its context. 
If the AI does not know the answer to a question, it truthfully says it does not know.\n", + "\n", + "Current conversation:\n", + "Human: Describe a language model in one sentence\n", + "AI: A language model is a statistical model that is trained on a large corpus of text and is able to generate coherent and grammatically correct sentences based on the patterns and structures it has learned.\n", + "Human: Describe it again using less words\n", + "AI: A language model is a computer program that can generate sentences based on patterns it has learned from a large amount of text.\n", + "Human: Describe it again fewer words but at least one word\n", + "AI: A language model is a program that generates sentences from text patterns.\n", + "Human: What did I first ask you? I forgot.\n", + "AI:\u001b[0m\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Failed to batch ingest runs: LangSmithError('Failed to POST https://api.smith.langchain.com/runs/batch in LangSmith API. HTTPError(\\'403 Client Error: Forbidden for url: https://api.smith.langchain.com/runs/batch\\', \\'{\"detail\":\"Forbidden\"}\\')')\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\u001b[1m> Finished chain.\u001b[0m\n" + ] + }, + { + "data": { + "text/plain": [ + "' You asked me to describe a language model in one sentence.'" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Failed to batch ingest runs: LangSmithError('Failed to POST https://api.smith.langchain.com/runs/batch in LangSmith API. 
HTTPError(\\'403 Client Error: Forbidden for url: https://api.smith.langchain.com/runs/batch\\', \\'{\"detail\":\"Forbidden\"}\\')')\n" + ] + } + ], + "source": [ + "from langchain.memory import ChatMessageHistory, ConversationBufferMemory, ConversationSummaryMemory\n", + "from langchain.chains import LLMChain, ConversationChain, RetrievalQA, RetrievalQAWithSourcesChain\n", + "# Create an Open AI Chat Model\n", + "chat = OpenAI(temperature=0, openai_api_key=OPENAI_API_KEY)\n", + "\n", + "# Create the memory object with size set to 2\n", + "memory = ConversationBufferMemory(size=4)\n", + "buffer_chain = ConversationChain(llm=chat, memory=memory, verbose=True)\n", + "\n", + "# \n", + "buffer_chain.predict(input=\"Describe a language model in one sentence\")\n", + "buffer_chain.predict(input=\"Describe it again using less words\")\n", + "buffer_chain.predict(input=\"Describe it again fewer words but at least one word\")\n", + "buffer_chain.predict(input=\"What did I first ask you? I forgot.\")" + ] + }, + { + "cell_type": "markdown", + "id": "efd52d16-a392-4aad-83e7-2e044b6d0c43", + "metadata": {}, + "source": [ + "**EXERCISE-2. For the ```ConversationBufferMemory```, change the buffer size to 1 or 2 and see what happens**" + ] + }, + { + "cell_type": "markdown", + "id": "8e535976-cabe-4162-b8ca-e84532dc783c", + "metadata": {}, + "source": [ + "## ConversationSummaryMemory\n", + "For longer conversations, storing the entire memory, or even a long buffer memory, may not be technically feasible. In these cases, a summary memory implementation can be a good option. Summary memories summarize the conversation at each step to retain the key context for the model to use. This works by using another LLM for generating the summaries, alongside the LLM used for generating the responses." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e7325752-2d3d-499e-b7ae-31122b1f64d5", + "metadata": {}, + "outputs": [], + "source": [ + "# ==============================================\n", + "# PLEASE FOLLOW INSTRUCTIONS AND COMPLETE CODE\n", + "# ==============================================\n", + "\n", + "# Use openAI model from earlier as a summary model\n", + "summary_llm = YOUR CODE HERE\n", + "\n", + "# Complete code below by putting in summary model above\n", + "memory = ConversationSummaryMemory(llm=summary_llm)\n", + "\n", + "# Create a chat model to use in the Conversation chain below (refer\n", + "# previous cells where we created OpenAI chat model\n", + "chat_model = YOUR CODE HERE\n", + "\n", + "# Create a conversation chain as we did before \n", + "summary_chain = YOUR CODE HERE\n", + "\n", + "summary_chain.predict(input=\"Please tell me about Malawi.\")\n", + "summary_chain.predict(input=\"Does that affect Malawi's income?\")" + ] + }, + { + "cell_type": "markdown", + "id": "f4c250ad-6f8a-42a6-99d5-bdb7b75df1e1", + "metadata": {}, + "source": [ + "# 3. Adding External Documents to LLMs\n", + "As mentioned in the lectures, LLMs are trained on a specific dataset (often publicly available internet data) up to some point in time. Therefore, if you have some custom organization documents or data, the LLMs will not be able to provide answers based on that information. Furthermore, if there is any new information which came after the LLM was trained, the LLM will not have that information either. \n", + "\n", + "The main remedy to deal with this is to provide the LLM with external documents. 
Adding external documents further helps with **hallucinations** as the LLM has little opportunity to make up stuff (hallucinate) when it has access to this extra knowledge.\n", + "\n", + "In LangChain, there are four main steps to provide external documents to the LLM (essentially, to create a Retrieval Augmented Generation chatbot, or **RAG Chatbot**):\n", + "1. Identify the data sources (documents, datasets, websites, databases, etc.).\n", + "\n", + "2. Load the documents into LangChain using document loaders. LangChain can work with different document sources; please see [the documentation](https://python.langchain.com/v0.1/docs/integrations/document_loaders/). \n", + "\n", + "3. Split the documents into chunks. \n", + "\n", + "4. Create vector embeddings and store them in a vector database for retrieval." + ] + }, + { + "cell_type": "markdown", + "id": "8eb4a208-8c25-4060-bf4b-c4eb06e26557", + "metadata": {}, + "source": [ + "### 3.1 Document Loaders\n", + "LangChain has more than 160 document loaders. Some loaders are provided by 3rd parties who manage unique document formats. These include Amazon S3, Microsoft, Google Cloud, Jupyter notebooks, pandas DataFrames, unstructured HTML, YouTube audio transcripts, and more. " + ] + }, + { + "cell_type": "markdown", + "id": "9efebd7e-59c0-4aef-81a3-5d9a500d1319", + "metadata": {}, + "source": [ + "#### PDF Document Loader\n", + "- Requires installation of the ```pypdf``` package as a dependency.\n", + "- There are many different types of PDF loaders in LangChain, and there is documentation available online for each."
+ ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "fc21c844-25b1-4447-a7ea-4aff7ad450ed", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: pypdf in /Users/dunstanmatekenya/anaconda3/lib/python3.10/site-packages (3.8.1)\n" + ] + } + ], + "source": [ + "!pip install pypdf" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "417c534c-4a01-49a4-9c8f-16450dec011a", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.document_loaders import PyPDFLoader\n", + "loader = PyPDFLoader(str(FILE_DENGUE))\n", + "data = loader.load()\n", + "print(data[0])" + ] + }, + { + "cell_type": "markdown", + "id": "bdb6fbaf-34c0-4fc6-8570-76aa853e78a5", + "metadata": {}, + "source": [ + "**EXERCISE-3. Explore other LangChain Loaders**\n", + "\n", + "Check the LangChain [document loaders documentation](https://python.langchain.com/v0.1/docs/integrations/document_loaders/) \n", + "and also check [here](https://python.langchain.com/v0.1/docs/modules/data_connection/) for the most commonly used loaders.\n", + "1. Identify 5 document loaders you find interesting. What are third-party document loaders?\n", + "2. **HTML loaders**. Explore the HTML or webpage loaders. \n", + "3. Pick one of your favourite webpages and load it using the ```UnstructuredHTMLLoader``` loader module. Refer to the [documentation](https://python.langchain.com/v0.1/docs/modules/data_connection/document_loaders/html/) on how to import the module.\n", + "4. How do you think this changes your approach to ```web-scraping```? Do you think web scraping will change or not with these new capabilities to just connect to a website and query it?"
+ ] + }, + { + "cell_type": "markdown", + "id": "07bde0cf-56cb-4726-a3f0-cfc839ba1d3e", + "metadata": {}, + "source": [ + "### 3.2 Preparing documents for vector database and retrieval\n", + "In this stage, there are three sub-steps:\n", + "- The document is split to enhance efficiency in storage, indexing and, ultimately, retrieval. Furthermore, chunking also helps with ensuring the document (which acts as context) can fit in the context window \n", + "- An embedding model is used to convert the documents into ```vector embeddings```\n", + "- The vectorized data is stored into a vector database." + ] + }, + { + "cell_type": "markdown", + "id": "ba116fde-3896-40d2-b921-18c2de13b56d", + "metadata": {}, + "source": [ + "#### Splitting/Chunking Documents\n", + "- Given a PDF document, one naive splitting option would be to separate the document into lines as they appear in the document. This would be simple to implement but could be problematic. Key context required for understanding one line is often found in a different line, and these lines would be processed separately, so we need another strategy which can maintain context across pieces of text in the document. Enter the **overlap concept**.\n", + "We will compare two document splitting methods from LangChain. \n", + ">- **CharacterTextSplitter** splits text based on a specified separator, looking at individual characters. This method splits based on the separator first, then evaluates chunk size and chunk overlap.\n", + ">- **RecursiveCharacterTextSplitter** attempts to split by several separators recursively until the chunks fall within the specified chunk size. There are many other methods that use natural language processing to infer meaning and split appropriately. Optimizing this is an active area of research.\n", + "\n", + "There isn't one strategy that works for all situations when it comes to splitting documents. 
\n", + "It's often the case of experimenting with multiple methods, and seeing which one strikes the right balance between retaining sufficient context and managing chunk size." + ] + }, + { + "cell_type": "markdown", + "id": "633a6334-4e51-4e28-b2a9-1967fd36d6b7", + "metadata": {}, + "source": [ + "##### CharacterTextSplitter" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "9cabe5b1-557f-4480-83ef-9637682546c5", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Created a chunk of size 52, which is longer than the specified 24\n" + ] + } + ], + "source": [ + "from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter\n", + "quote = 'One machine can do the work of fifty ordinary humans.\\\n", + "No machine can do the work of one extraordinary human.'\n", + "\n", + "chunk_size = 24\n", + "chunk_overlap = 3\n", + "\n", + "ct_splitter = CharacterTextSplitter(separator=\".\", \n", + " chunk_overlap=chunk_overlap, chunk_size=chunk_size)\n", + "\n", + "docs = ct_splitter.split_text(quote)" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "id": "2a6a8893-4f70-40db-a1a2-66055f532d3d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['One machine can do the work of fifty ordinary humans',\n", + " 'No machine can do the work of one extraordinary human']" + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "docs" + ] + }, + { + "cell_type": "markdown", + "id": "08337229-f503-429b-8345-e5d987f0d774", + "metadata": {}, + "source": [ + "##### RecursiveCharacterTextSplitter" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "id": "431f2615-19d8-45a7-b626-f78e45332534", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['One machine can do the', 'work of fifty ordinary', 'humans.No machine can', 'do the work of one', 
'extraordinary human.']\n" + ] + } + ], + "source": [ + "# Using the same variables: chunk_size and chunk_overlap, instatiate RecursiveCharacterTextSplitter\n", + "rc_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap = chunk_overlap)\n", + "docs = rc_splitter.split_text(quote)\n", + "print(docs)" + ] + }, + { + "cell_type": "markdown", + "id": "8777067d-35d7-471d-983f-fec0a6aafbc0", + "metadata": {}, + "source": [ + "#### Load data into a vector database\n", + "At this stage, you will need to decide which vector database to use. \n", + "For our simple demonstration purposes, we will use [chromadb](https://www.trychroma.com), an open-source vector database solution. The type of vector database solution you choose can depend on numerous factors such as:\n", + "- How large the documents you will be processing are\n", + "- How much money you have to spend on the project\n", + "- Efficiency/latency requirements for your use case: if you need to provide answers in real time, you may need a different solution\n", + "- Accuracy requirements. Sometimes there is a tradeoff between accuracy and latency.\n", + "- Integration requirements with existing platforms. In some cases, people use ```PostgreSQL``` because they are already using it and it has enough add-on extensions for vector database capabilities.\n", + "\n", + "Another decision is the choice of **embedding model**: the LLM which converts the text/documents into vectors. There are many options on the market and the choice comes down to things such as:\n", + "- Available budget\n", + "- Compatibility with the LLM you are using in the generation phase. 
People do use a different embedding model from the generation model\n", + "> embedding_llm = Mistral, \n", + "> chat_model = ChatOpenAI\n", + "- Nature of documents, size and alot of other factors" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "id": "79cbfc3e-d388-4410-8dff-56a46a47c53d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Collecting sentence_transformers\n", + " Downloading sentence_transformers-3.0.0-py3-none-any.whl (224 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m224.7/224.7 kB\u001b[0m \u001b[31m827.6 kB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: huggingface-hub>=0.15.1 in /Users/dunstanmatekenya/anaconda3/lib/python3.10/site-packages (from sentence_transformers) (0.23.2)\n", + "Requirement already satisfied: scipy in /Users/dunstanmatekenya/anaconda3/lib/python3.10/site-packages (from sentence_transformers) (1.10.1)\n", + "Requirement already satisfied: scikit-learn in /Users/dunstanmatekenya/anaconda3/lib/python3.10/site-packages (from sentence_transformers) (1.2.2)\n", + "Requirement already satisfied: torch>=1.11.0 in /Users/dunstanmatekenya/anaconda3/lib/python3.10/site-packages (from sentence_transformers) (2.1.2)\n", + "Requirement already satisfied: tqdm in /Users/dunstanmatekenya/anaconda3/lib/python3.10/site-packages (from sentence_transformers) (4.65.0)\n", + "Requirement already satisfied: Pillow in /Users/dunstanmatekenya/anaconda3/lib/python3.10/site-packages (from sentence_transformers) (9.4.0)\n", + "Requirement already satisfied: transformers<5.0.0,>=4.34.0 in /Users/dunstanmatekenya/anaconda3/lib/python3.10/site-packages (from sentence_transformers) (4.41.2)\n", + "Requirement already satisfied: numpy in /Users/dunstanmatekenya/anaconda3/lib/python3.10/site-packages (from sentence_transformers) (1.23.5)\n", + "Requirement already satisfied: 
fsspec>=2023.5.0 in /Users/dunstanmatekenya/anaconda3/lib/python3.10/site-packages (from huggingface-hub>=0.15.1->sentence_transformers) (2023.12.2)\n", + "Requirement already satisfied: typing-extensions>=3.7.4.3 in /Users/dunstanmatekenya/anaconda3/lib/python3.10/site-packages (from huggingface-hub>=0.15.1->sentence_transformers) (4.8.0)\n", + "Requirement already satisfied: packaging>=20.9 in /Users/dunstanmatekenya/anaconda3/lib/python3.10/site-packages (from huggingface-hub>=0.15.1->sentence_transformers) (23.2)\n", + "Requirement already satisfied: pyyaml>=5.1 in /Users/dunstanmatekenya/anaconda3/lib/python3.10/site-packages (from huggingface-hub>=0.15.1->sentence_transformers) (6.0.1)\n", + "Requirement already satisfied: filelock in /Users/dunstanmatekenya/anaconda3/lib/python3.10/site-packages (from huggingface-hub>=0.15.1->sentence_transformers) (3.9.0)\n", + "Requirement already satisfied: requests in /Users/dunstanmatekenya/anaconda3/lib/python3.10/site-packages (from huggingface-hub>=0.15.1->sentence_transformers) (2.28.1)\n", + "Requirement already satisfied: networkx in /Users/dunstanmatekenya/anaconda3/lib/python3.10/site-packages (from torch>=1.11.0->sentence_transformers) (2.8.4)\n", + "Requirement already satisfied: sympy in /Users/dunstanmatekenya/anaconda3/lib/python3.10/site-packages (from torch>=1.11.0->sentence_transformers) (1.12)\n", + "Requirement already satisfied: jinja2 in /Users/dunstanmatekenya/anaconda3/lib/python3.10/site-packages (from torch>=1.11.0->sentence_transformers) (3.1.2)\n", + "Requirement already satisfied: regex!=2019.12.17 in /Users/dunstanmatekenya/anaconda3/lib/python3.10/site-packages (from transformers<5.0.0,>=4.34.0->sentence_transformers) (2022.7.9)\n", + "Requirement already satisfied: tokenizers<0.20,>=0.19 in /Users/dunstanmatekenya/anaconda3/lib/python3.10/site-packages (from transformers<5.0.0,>=4.34.0->sentence_transformers) (0.19.1)\n", + "Requirement already satisfied: safetensors>=0.4.1 in 
/Users/dunstanmatekenya/anaconda3/lib/python3.10/site-packages (from transformers<5.0.0,>=4.34.0->sentence_transformers) (0.4.3)\n", + "Requirement already satisfied: threadpoolctl>=2.0.0 in /Users/dunstanmatekenya/anaconda3/lib/python3.10/site-packages (from scikit-learn->sentence_transformers) (2.2.0)\n", + "Requirement already satisfied: joblib>=1.1.1 in /Users/dunstanmatekenya/anaconda3/lib/python3.10/site-packages (from scikit-learn->sentence_transformers) (1.1.1)\n", + "Requirement already satisfied: MarkupSafe>=2.0 in /Users/dunstanmatekenya/anaconda3/lib/python3.10/site-packages (from jinja2->torch>=1.11.0->sentence_transformers) (2.1.1)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /Users/dunstanmatekenya/anaconda3/lib/python3.10/site-packages (from requests->huggingface-hub>=0.15.1->sentence_transformers) (2024.2.2)\n", + "Requirement already satisfied: charset-normalizer<3,>=2 in /Users/dunstanmatekenya/anaconda3/lib/python3.10/site-packages (from requests->huggingface-hub>=0.15.1->sentence_transformers) (2.1.1)\n", + "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /Users/dunstanmatekenya/anaconda3/lib/python3.10/site-packages (from requests->huggingface-hub>=0.15.1->sentence_transformers) (1.26.15)\n", + "Requirement already satisfied: idna<4,>=2.5 in /Users/dunstanmatekenya/anaconda3/lib/python3.10/site-packages (from requests->huggingface-hub>=0.15.1->sentence_transformers) (3.4)\n", + "Requirement already satisfied: mpmath>=0.19 in /Users/dunstanmatekenya/anaconda3/lib/python3.10/site-packages (from sympy->torch>=1.11.0->sentence_transformers) (1.3.0)\n", + "Installing collected packages: sentence_transformers\n", + "Successfully installed sentence_transformers-3.0.0\n" + ] + } + ], + "source": [ + "!pip install sentence_transformers" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1ad4c76d-7248-49d6-acf2-3c1193bd2dcb", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain_openai import 
OpenAIEmbeddings\n", + "from langchain_community.vectorstores import Chroma\n", + "from langchain_community.embeddings import HuggingFaceEmbeddings\n", + "\n", + "\n", + "# Lets load the Cholera paper and then store it in a database\n", + "loader = PyPDFLoader(str(FILE_HEP_CHAD))\n", + "data = loader.load()\n", + "\n", + "chunk_size = 100\n", + "chunk_overlap = 10\n", + "\n", + "# Split with RecursiveCharacterTextSplitter\n", + "rc_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap = chunk_overlap)\n", + "docs = rc_splitter.split_documents(data)\n", + "\n", + "# Lets use openAI embedding model\n", + "#embedding_model = OpenAIEmbeddings(openai_api_type=OPENAI_API_KEY)\n", + "embedding_model = HuggingFaceEmbeddings(model_name=\"all-MiniLM-L6-v2\")\n", + "# Directory to store our database-set this to the data directory\n", + "vectordb = Chroma(persist_directory=str(DIR_DATA), embedding_function=embedding_model)\n", + "\n", + "# Store the databse\n", + "vectordb.persist()\n", + "\n", + "# Create the database\n", + "docstorage = Chroma.from_documents(docs, embedding_model)" + ] + }, + { + "cell_type": "markdown", + "id": "e30d865a-42fb-4325-b53c-84da656a0703", + "metadata": {}, + "source": [ + "**EXERCISE-4. Explore what functionality is available under the database object ```docstorage```**\n", + "- You can use ```dir(object)``` to check available attributes and functions\n", + "- Note that there are many search-related functions which enable you to control how user queries are searched when building chatbots" + ] + }, + { + "cell_type": "markdown", + "id": "52ea226f-8de3-4c5c-a69e-9d8971b047cf", + "metadata": {}, + "source": [ + "### 3.3 Retrieval\n", + "Now that we have added our external file, let's use the added document as context in our LLM chains and ask questions again." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 40, + "id": "5af5276c-4235-44c8-926f-652acf3d16dd", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Failed to batch ingest runs: LangSmithError('Failed to POST https://api.smith.langchain.com/runs/batch in LangSmith API. HTTPError(\\'403 Client Error: Forbidden for url: https://api.smith.langchain.com/runs/batch\\', \\'{\"detail\":\"Forbidden\"}\\')')\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "============================================================\n", + "LLM Output without using RAG-external document from WHO website\n", + "============================================================\n", + "\n", + "\n", + "As of September 2021, there are several ongoing disease outbreaks in Chad. These include:\n", + "\n", + "1. COVID-19: Chad has been experiencing a surge in COVID-19 cases since April 2021, with a peak in July. As of September 2021, there have been over 5,000 confirmed cases and over 170 deaths.\n", + "\n", + "2. Cholera: A cholera outbreak was declared in June 2021 in the Lake Chad region, affecting areas near the border with Nigeria. As of September 2021, there have been over 2,000 suspected cases and 50 deaths.\n", + "\n", + "3. Measles: Chad has been experiencing a measles outbreak since January 2020. As of September 2021, there have been over 20,000 suspected cases and over 300 deaths, mainly affecting children under the age of 5.\n", + "\n", + "4. Yellow fever: A yellow fever outbreak was declared in November 2020, affecting several regions in Chad. As of September 2021, there have been over 60 confirmed cases and 10 deaths.\n", + "\n", + "5. 
Meningitis: Chad is currently experiencing a meningitis outbreak, with over 2,000 suspected cases and 200 deaths reported since the beginning of 2021.\n", + "\n", + "\n", + "\n", + "============================================================\n", + "LLM Output with RAG-external document from WHO website\n", + "============================================================\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Failed to batch ingest runs: LangSmithError('Failed to POST https://api.smith.langchain.com/runs/batch in LangSmith API. HTTPError(\\'403 Client Error: Forbidden for url: https://api.smith.langchain.com/runs/batch\\', \\'{\"detail\":\"Forbidden\"}\\')')\n", + "Failed to batch ingest runs: LangSmithError('Failed to POST https://api.smith.langchain.com/runs/batch in LangSmith API. HTTPError(\\'403 Client Error: Forbidden for url: https://api.smith.langchain.com/runs/batch\\', \\'{\"detail\":\"Forbidden\"}\\')')\n", + "Failed to batch ingest runs: LangSmithError('Failed to POST https://api.smith.langchain.com/runs/batch in LangSmith API. HTTPError(\\'403 Client Error: Forbidden for url: https://api.smith.langchain.com/runs/batch\\', \\'{\"detail\":\"Forbidden\"}\\')')\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Yes, there is currently a hepatitis E outbreak in Chad, specifically in the eastern Ouaddai province. This outbreak was last reported on May 8, 2024.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Failed to batch ingest runs: LangSmithError('Failed to POST https://api.smith.langchain.com/runs/batch in LangSmith API. 
HTTPError(\\'403 Client Error: Forbidden for url: https://api.smith.langchain.com/runs/batch\\', \\'{\"detail\":\"Forbidden\"}\\')')\n" + ] + } + ], + "source": [ + "from langchain.chains import RetrievalQA\n", + "\n", + "# Create LLM as before \n", + "llm = OpenAI(openai_api_key=OPENAI_API_KEY)\n", + "\n", + "# Create retriever with \n", + "qa = RetrievalQA.from_chain_type(llm=llm, chain_type=\"stuff\", retriever=docstorage.as_retriever())\n", + "\n", + "# The question we will ask the LLM\n", + "# You can ask these questions in French and LLM will also answer in French\n", + "question = \"Are there any disease outbreaks in Chad?\"\n", + "\n", + "# Answer without RAG\n", + "output = llm.invoke(question)\n", + "print()\n", + "print(\"=\"*60)\n", + "print(\"LLM Output without using RAG-external document from WHO website\")\n", + "print(\"=\"*60)\n", + "print(output)\n", + "\n", + "# For RAG Chain, we put in the question as dictionary\n", + "print()\n", + "print(\"=\"*60)\n", + "print(\"LLM Output with RAG-external document from WHO website\")\n", + "print(\"=\"*60)\n", + "print(qa.run(question))" + ] + }, + { + "cell_type": "markdown", + "id": "3c735436-9e4a-410d-8f37-4f290cf51e1b", + "metadata": {}, + "source": [ + "**EXERCISE-5. Implement a simple RAG as we did above**\n", + "1. Use the ```FILE_MIDDLE_EAST_COVID``` file to create a new Chroma database\n", + "2. Implement a RAG chain as we did above.\n", + "3. Compare answers between the LLM with RAG and without RAG\n", + "\n", + "**Hint.** Copy and paste the code from above and edit it." + ] + }, + { + "cell_type": "markdown", + "id": "6375cfcc-c6a9-406a-8e72-cd0a04a9b2ac", + "metadata": {}, + "source": [ + "### 3.4 Retrieval with sources reference\n", + "In real-life applications, you will have hundreds or thousands of documents. A user of your system may need to know the source of the answers they are getting. Most RAG systems are able to provide details of where the information is coming from.
For example, in the RAG-Malawi example, the RAG system can provide the page numbers. In this case, with LangChain, you can just provide information about the document where the answer came from.\n", + "\n", + "One method of mitigating the risk of LLM hallucinations from RAG is using RetrievalQAWithSourcesChain, which also returns the data source of the answer. Aside from the chain class, the code is exactly the same as RetrievalQA. However, this class returns a dictionary containing a 'sources' key and an 'answer' key. The 'sources' key refers to the file where the answer came from, which is helpful when there are many documents in the database." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "399ddc48-1e99-43a4-ae28-298d5427b367", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.chains import RetrievalQAWithSourcesChain\n", + "\n", + "qa = RetrievalQAWithSourcesChain.from_chain_type(llm=llm, chain_type=\"stuff\", retriever=docstorage.as_retriever())\n", + "\n", + "results = qa({\"question\": \"Are there any disease outbreaks in Chad?\"},\n", + " return_only_outputs=True)\n", + "print(results)" + ] + }, + { + "cell_type": "markdown", + "id": "d7b0f265-8583-4357-9040-8dd75429179c", + "metadata": {}, + "source": [ + "# 4. LangChain Expression Language (LCEL)\n", + "> In summary, LCEL is a different (recommended) syntax for achieving the same things we have done in LangChain\n", + "\n", + "LCEL is a key part of the LangChain toolkit. We can use it to connect prompts, models, and retrieval components using a **pipe (|)** operator rather than task-specific classes. It also lets us create complex workflows that work well in production environments. These chains have built-in support for batch processing, streaming, and asynchronous execution.
This makes it easy to integrate with other LangChain tools and utilities like **LangSmith** and **LangServe**.\n", + "\n", + "A few notes about the chain with LCEL\n", + "- The ```| (pipe)``` in LCEL indicates that the output from one component will be used as the input to the next." + ] + }, + { + "cell_type": "markdown", + "id": "8229521e-790e-41b8-9fd8-159a54cae8c7", + "metadata": {}, + "source": [ + "## 4.1 A Simple Chain with LCEL" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "88efb378-36b2-4499-a033-c3145101475e", + "metadata": {}, + "outputs": [], + "source": [ + "model = ChatOpenAI(openai_api_key=OPENAI_API_KEY)\n", + "prompt = ChatPromptTemplate.from_template(\"You are a helpful personal assistant. \\\n", + "Answer the following question: {question}\")\n", + "\n", + "# Create Chain in LCEL fashion\n", + "llm_chain = prompt | model\n", + "\n", + "# Recall how we created a chain before \n", + "#llm_chain = LLMChain(prompt=prompt, llm=llm)\n", + "\n", + "\n", + "# Run using invoke\n", + "print(llm_chain.invoke(\"What is the capital of Tunisia?\"))" + ] + }, + { + "cell_type": "markdown", + "id": "1ac8949f-0ad0-4d22-8dbf-e8fd992f4065", + "metadata": {}, + "source": [ + "## 4.2 RAG with LCEL" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "68b9b2eb-db18-49e2-b75e-2c27fb862f2e", + "metadata": {}, + "outputs": [], + "source": [ + "model = ChatOpenAI(openai_api_key = OPENAI_API_KEY)\n", + "\n", + "embedding_model = OpenAIEmbeddings(openai_api_key = OPENAI_API_KEY)\n", + "vectorstore = Chroma.from_texts([\"Dunstan stayed in Tunis, the capital of Tunisia from Sunday May 26 to Satarday May 31.\"],embedding=embedding_model)\n", + "retriever = vectorstore.as_retriever()\n", + "\n", + "template = \"\"\"Answer the question based on the context:{context}. 
Question: {question}\"\"\"\n", + "prompt = ChatPromptTemplate.from_template(template)\n", + "\n", + "chain = ({\"context\": retriever,\"question\": RunnablePassthrough()} | prompt | model | StrOutputParser())\n", + "chain.invoke(\"When did Dunstan visit Tunisia?\")" + ] + }, + { + "cell_type": "markdown", + "id": "c6ac7202-4bb2-446d-85bf-e55bed0f53b1", + "metadata": {}, + "source": [ + "## 4.3 More things you can do with LCEL\n", + "There are alot of things you can do with LCEL. For example,\n", + "- **Batch or Streaming**. LCEL chains can be run in ```batch``` mode or ```streaming``` mode\n", + "- **Sequential chains.**. Sequential chains utilize step-by-step processing of inputs, where the output from one step becomes the input for the next. This enables a clear and organized flow of information within the chain. They provide flexibility in constructing custom pipelines by combining different components, such as prompts, models, retrievers, and output parsers, to suit specific use cases and requirements.\n", + "- **Passing Data Across Chains.** There are many cases where your application will require the use of several chains that pass outputs between them" + ] + }, + { + "cell_type": "markdown", + "id": "826bc2fc-cb3b-4d23-abab-52451461e0c4", + "metadata": {}, + "source": [ + "### Using sequential chaining to create Python code and check it with LCEL" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8bc0c20c-167c-454c-a87a-db01aa8f2855", + "metadata": {}, + "outputs": [], + "source": [ + "coding_prompt = PromptTemplate.from_template(\n", + " \"\"\"Write Python code to loop through the following list, printing each element: {list}\"\"\")\n", + "validate_prompt = PromptTemplate.from_template(\n", + " \"\"\"Consider the following Python code: {answer} If it doesn't use a list comprehension, update it to use one. 
If it does use a list comprehension, return the original code without explanation:\"\"\")\n", + "\n", + "llm = ChatOpenAI(openai_api_key=OPENAI_API_KEY)\n", + "\n", + "# Create the sequential chain\n", + "chain = ({\"answer\": coding_prompt | llm | StrOutputParser()}\n", + " | validate_prompt\n", + " | llm \n", + " | StrOutputParser() )\n", + "\n", + "# Invoke the chain with the user's question\n", + "print(chain.invoke({\"list\": \"[3, 1, 4, 1]\"}))" + ] + }, + { + "cell_type": "markdown", + "id": "2ae9fb57-5565-4bdf-a977-67f735260e51", + "metadata": {}, + "source": [ + "# 5. LangChain Agents\n", + "In LLMs and Gen AI, the idea behind agents is to use language models to determine which sequence of actions to take to meet a pre-defined objective. Thus, the LLM is able to solve complex problems or perform complex tasks by planning, determining what tools to use and what knowledge to get until the task is solved without explicit supervision.\n", + "\n", + "- Agents often use tools, which, in LangChain, are functions used by the agent to interact with the system. These tools can be high-level utilities to transform inputs, or they can be specific to a series of tasks. Agents can even use chains and other agents as tools!\n", + "- In LangChain, there are different agent types. See [this documentation](https://python.langchain.com/v0.1/docs/modules/agents/agent_types/) for explanation of how the agents are categorized. \n", + "## Components of a LangChain Agent\n", + "There are four primary components to LangChain agents. \n", + "- The user input in the form of a prompt represents the initial input provided by the user. \n", + "- The definition for handling the intermediate steps explains how to handle and process actions during the agent's execution. \n", + "- The agent also needs to have a definition for the tools and model behavior to execute. \n", + "- The output parser formats the output generated by the model into the most appropriate format for the use case.
Agents can be defined for specificity or high-level thought processes." + ] + }, + { + "cell_type": "markdown", + "id": "00fa06af-c231-4237-8d62-dc42ff0f59de", + "metadata": {}, + "source": [ + "## 5.1 Zero-Shot ReAct agent\n", + "ReAct stands for **Reasoning and Acting**. This simplifies the answer to infer as much context as possible. \n", + "We start by importing the initialize_agent function and AgentType for agent creation and configuration" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a1e3cf87-c98c-4dfa-9ab4-65bdee6f47df", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.agents import initialize_agent, AgentType, load_tools\n", + "\n", + "# Define LLM\n", + "llm = OpenAI(model_name=\"gpt-3.5-turbo-instruct\", temperature=0, openai_api_key=OPENAI_API_KEY)\n", + "\n", + "# Define what tools the agent will will use, it can be more than one tool\n", + "tools = load_tools([\"llm-math\"], llm=llm)\n", + "agent = initialize_agent(tools, llm, agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION, verbose=True)\n", + "agent.run(\"What is 10 multiplied by 50?\")" + ] + }, + { + "cell_type": "markdown", + "id": "aab0fb85-881a-4b00-b1a6-1cee7d7a3f75", + "metadata": {}, + "source": [ + "## 5.2 Other Agents \n", + "There are a lot of other agents and tools in LangChain. For example, in order to interact with a database or structured dataset we will utilise an ```SQLAgent```" + ] + }, + { + "cell_type": "markdown", + "id": "971ae0ee-c01d-4fea-b992-110d6c7e0edf", + "metadata": {}, + "source": [ + "# 6. Evaluating LLM Outputs in LangChain\n", + "As mentioned in Lectures, it's important to evaluate LLM model outputs as well as all ML based outputs for that matter. \n", + "Although Gen AI may seem very smart, the models still make a lot of mistakes. As such, evaluating AI applications is important for several reasons. \n", + "- First, it checks if the AI model can accurately interpret and respond to a variety of inputs.
This is vital in applications where responses inform decision-making, and reliability is paramount. \n", + "- Evaluation also helps identify the strengths and weaknesses of a model, which allows for targeted and continuous improvements, and builds trust among users and stakeholders. \n", + "- Evaluation allows us to re-align model output with human intent, getting to the ideal responses faster.\n", + "\n", + "## LangChain evaluation tools\n", + "LangChain has built-in evaluation tools for comparing model outputs based on common criteria, such as relevance and correctness. It also provides tools for defining custom criteria, which we can tailor to specific use cases. Finally, the ```QAEvalChain class``` is another tool that can be used to measure how well an AI's response answers a specific question using ground truth responses." + ] + }, + { + "cell_type": "markdown", + "id": "abf96d76-5901-4fe0-83b5-881e6e340b92", + "metadata": {}, + "source": [ + "## 6.1 LangChain Built-in Evaluation Metrics" + ] + }, + { + "cell_type": "markdown", + "id": "c7752f75-65d4-4a91-83dc-3b4740e544d9", + "metadata": {}, + "source": [ + "**EXERCISE-6: Explore Evaluation Metrics in LangChain**\n", + "- run this import statement: ```from langchain.evaluation import Criteria```\n", + "- use ``list`` function on Criteria to check the list of available criteria" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "id": "ab44f6f3-2489-4670-88f4-a3361c6a7fc3", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Failed to batch ingest runs: LangSmithError('Failed to POST https://api.smith.langchain.com/runs/batch in LangSmith API.
HTTPError(\\'403 Client Error: Forbidden for url: https://api.smith.langchain.com/runs/batch\\', \\'{\"detail\":\"Forbidden\"}\\')')\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'reasoning': 'Step 1: Identify the criterion - relevance.\\n\\nStep 2: Read the input and submission to determine if they are referring to a real quote from the text.\\n\\nStep 3: The input is asking a math question, not referring to a quote from the text.\\n\\nStep 4: The submission is referring to a different topic, the capital of New York state, and not a quote from the text.\\n\\nStep 5: Therefore, the submission does not meet the criterion of relevance.\\n\\nConclusion: The submission does not meet the criterion of relevance.', 'value': 'N', 'score': 0}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Failed to batch ingest runs: LangSmithError('Failed to POST https://api.smith.langchain.com/runs/batch in LangSmith API. HTTPError(\\'403 Client Error: Forbidden for url: https://api.smith.langchain.com/runs/batch\\', \\'{\"detail\":\"Forbidden\"}\\')')\n" + ] + } + ], + "source": [ + "from langchain.chat_models import ChatOpenAI\n", + "from langchain.evaluation import load_evaluator\n", + "\n", + "\n", + "llm = OpenAI(openai_api_key=OPENAI_API_KEY)\n", + "\n", + "evaluator = load_evaluator(\"criteria\", criteria=\"relevance\",llm=llm)\n", + "eval_result = evaluator.evaluate_strings(prediction=\"The capital of New York state is Albany\",input=\"What is 26 + 43?\")\n", + "print(eval_result)" + ] + }, + { + "cell_type": "markdown", + "id": "28a2c168-9b70-427d-9e0b-231212ec7699", + "metadata": {}, + "source": [ + "**EXERCISE-7: Try doing the same evaluation above with a different LLM (e.g., Mistral)**" + ] + }, + { + "cell_type": "markdown", + "id": "ce53d768-a26c-4509-948e-c464ebd20310", + "metadata": {}, + "source": [ + "## 6.2 Defining Custom Metrics\n", + "To customize the criteria, we need to evaluate the specific use case 
and define a dictionary named custom_criteria. This example adds simplicity, bias, clarity, and truthfulness criteria. Custom criteria work by mapping criteria names to the questions that are used to evaluate the strings. To use these new criteria, create an evaluator object, but this time, using our custom_criteria." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3b248976-85a5-4ef3-9998-48cf1afa9311", + "metadata": {}, + "outputs": [], + "source": [ + "custom_criteria = {\"simplicity\": \"Does the language use brevity?\",\n", + " \"bias\": \"Does the language stay free of human bias?\",\n", + " \"clarity\": \"Is the writing easy to understand?\",\n", + " \"truthfulness\": \"Is the writing honest and factual?\"}\n", + "\n", + "evaluator = load_evaluator(\"criteria\", criteria=custom_criteria,\n", + " llm=llm)\n", + "eval_result = evaluator.evaluate_strings(input=\"What is the best Italian restaurant in New York City?\",\n", + "prediction=\"That is a subjective statement and I cannot answer that.\")\n", + "print(eval_result)" + ] + }, + { + "cell_type": "markdown", + "id": "ef61e633-de72-4019-940c-7021b0e7c2e1", + "metadata": {}, + "source": [ + "## 6.3 QAEvalChain\n", + "Question-Answering (QA) is one of the most popular applications of LLMs. But it is not always obvious what parameters (e.g., chunk size) or components (e.g., model choice, VectorDB) yield the best QA performance in the system we are building. The QA eval chain is an LLM chain for evaluating the performance of an LLM on a QA task. Refer to this detailed [LangChain blog post](https://blog.langchain.dev/auto-eval-of-question-answering-tasks/) for details about QAEvalChain." + ] + }, + { + "cell_type": "markdown", + "id": "1585ea3a-1be2-40db-9c88-774c03e220a7", + "metadata": {}, + "source": [ + "### 6.3.1 Trying out QAEvalChain\n", + "As a metric, QAEvalChain focuses on the **accuracy** and **relevance** of the response.
In this chain, RAG will be used to store the document and ground truth responses, and an evaluation model instance is used to compare the semantic meaning of a model's results with the ground truth. \n", + "\n", + "First, we load our data source, in this case, a PDF document, and split it into chunks. Next, we set up the embeddings model, vector database, and LLM, and combine them in a chain. The input_key is set to \"question\", as questions will be used to query the database" + ] + }, + { + "cell_type": "markdown", + "id": "3e59bac0-ed4c-4ffe-b64c-423f88f1aab3", + "metadata": {}, + "source": [ + "### Create a RAG Retriever " + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "id": "2fc52a82-7b90-4b0f-953a-32cb90beee54", + "metadata": {}, + "outputs": [], + "source": [ + "# Load the dengue document (FILE_DENGUE) and then store it in a database\n", + "loader = PyPDFLoader(str(FILE_DENGUE))\n", + "data = loader.load()\n", + "\n", + "chunk_size = 100\n", + "chunk_overlap = 50\n", + "\n", + "# Split with RecursiveCharacterTextSplitter\n", + "rc_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap = chunk_overlap)\n", + "docs = rc_splitter.split_documents(data)\n", + "\n", + "# Use the OpenAI embedding model; the key must be passed as openai_api_key (not openai_api_type)\n", + "embedding_model = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)\n", + "\n", + "# Directory to store our database-set this to the data directory\n", + "vectordb = Chroma(persist_directory=str(DIR_DATA), embedding_function=embedding_model)\n", + "\n", + "# Store the database\n", + "vectordb.persist()\n", + "\n", + "# Create the database\n", + "docstorage = Chroma.from_documents(docs, embedding_model)\n", + "\n", + "# LLM\n", + "llm = OpenAI(model_name=\"gpt-3.5-turbo-instruct\", openai_api_key=OPENAI_API_KEY)\n", + "\n", + "# Define the retriever chain\n", + "qa = RetrievalQA.from_chain_type(llm=llm, chain_type=\"stuff\", retriever=docstorage.as_retriever(), input_key=\"question\")" + ] + }, + { + "cell_type": "markdown", + "id":
"a9dddef3-e7ab-4c3e-9086-121be7b3b8a4", + "metadata": {}, + "source": [ + "## Define a Question Set as Key-Value Pairs in a Dict\n", + "This is a ground-truth dataset which a list of questions and their correct responses." + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "id": "f4d95da4-f130-4a2f-b0e5-40d22146674e", + "metadata": {}, + "outputs": [], + "source": [ + "question_set = [{\"question\": \"Did dengue cases increase in 2023?\",\n", + " \"answer\": \"Yes, in 2023, there was an increase in cases globally.\"},\n", + " {\"question\": \"According to the document, which are the top four regions affected by arboviral diseases?\",\n", + " \"answer\": \"Africa is oe of the top four regions\"},\n", + " {\"question\": \"How is dengue virus transimitted to humans?\",\n", + " \"answer\": \"through the bite of infected mosquitoes\"}]" + ] + }, + { + "cell_type": "markdown", + "id": "7a509f17-4bec-4bdc-805e-321b37581a84", + "metadata": {}, + "source": [ + "## Run QAEVAL" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "id": "5dff29e4-037b-464f-9135-3f311daf4727", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Failed to batch ingest runs: LangSmithError('Failed to POST https://api.smith.langchain.com/runs/batch in LangSmith API. HTTPError(\\'403 Client Error: Forbidden for url: https://api.smith.langchain.com/runs/batch\\', \\'{\"detail\":\"Forbidden\"}\\')')\n", + "Failed to batch ingest runs: LangSmithError('Failed to POST https://api.smith.langchain.com/runs/batch in LangSmith API. HTTPError(\\'403 Client Error: Forbidden for url: https://api.smith.langchain.com/runs/batch\\', \\'{\"detail\":\"Forbidden\"}\\')')\n", + "Failed to batch ingest runs: LangSmithError('Failed to POST https://api.smith.langchain.com/runs/batch in LangSmith API. 
HTTPError(\\'403 Client Error: Forbidden for url: https://api.smith.langchain.com/runs/batch\\', \\'{\"detail\":\"Forbidden\"}\\')')\n", + "Failed to batch ingest runs: LangSmithError('Failed to POST https://api.smith.langchain.com/runs/batch in LangSmith API. HTTPError(\\'403 Client Error: Forbidden for url: https://api.smith.langchain.com/runs/batch\\', \\'{\"detail\":\"Forbidden\"}\\')')\n", + "Failed to batch ingest runs: LangSmithError('Failed to POST https://api.smith.langchain.com/runs/batch in LangSmith API. HTTPError(\\'403 Client Error: Forbidden for url: https://api.smith.langchain.com/runs/batch\\', \\'{\"detail\":\"Forbidden\"}\\')')\n", + "Failed to batch ingest runs: LangSmithError('Failed to POST https://api.smith.langchain.com/runs/batch in LangSmith API. HTTPError(\\'403 Client Error: Forbidden for url: https://api.smith.langchain.com/runs/batch\\', \\'{\"detail\":\"Forbidden\"}\\')')\n", + "Failed to batch ingest runs: LangSmithError('Failed to POST https://api.smith.langchain.com/runs/batch in LangSmith API. HTTPError(\\'403 Client Error: Forbidden for url: https://api.smith.langchain.com/runs/batch\\', \\'{\"detail\":\"Forbidden\"}\\')')\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[{'results': ' CORRECT'}, {'results': ' INCORRECT'}, {'results': ' CORRECT'}]\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Failed to batch ingest runs: LangSmithError('Failed to POST https://api.smith.langchain.com/runs/batch in LangSmith API. 
HTTPError(\\'403 Client Error: Forbidden for url: https://api.smith.langchain.com/runs/batch\\', \\'{\"detail\":\"Forbidden\"}\\')')\n" + ] + } + ], + "source": [ + "from langchain.evaluation import QAEvalChain\n", + "predictions = qa.apply(question_set)\n", + "eval_chain = QAEvalChain.from_llm(llm)\n", + "\n", + "results = eval_chain.evaluate(question_set,predictions, question_key=\"question\",prediction_key=\"result\", answer_key='answer')\n", + "print(results)" + ] + }, + { + "cell_type": "markdown", + "id": "db20cb10-2438-43b2-b9d0-df3ae73d6418", + "metadata": {}, + "source": [ + "**EXERCISE-8 (Do this in Your Groups): Run Evaluation on a Custom Eval Dataset for a RAG Chatbot QA Task**\n", + "1. Create a RAG LLM Chain as we have done before.\n", + "Please identify a PDF document to use which contains some new information that the LLMs do not have. \n", + "Note that it can be a French or English document.\n", + "2. Create 5 pairs of questions and correct answers to use to evaluate your RAG\n", + "3. Run QAEVAL on the eval dataset and report how many responses the LLM got correct.\n", + "4. Do this again with a different LLM (e.g., Falcon or Mistral) and compare performance across models. *Note that your eval dataset remains the same.*" + ] + }, + { + "cell_type": "markdown", + "id": "20065378-5cd5-4b0f-ae9a-c2869075441a", + "metadata": {}, + "source": [ + "# 7. Summary\n", + "-----\n", + "In this notebook, we covered the basics of how to use LangChain to interact with both proprietary models from OpenAI and open source LLMs through the Hugging Face library. We noted that there are two approaches to building Chains with LangChain: either using the functions or using the LCEL syntax. We covered key topics as follows: creating chains and interacting with LLMs; managing memory of chat models; setting up RAG-based chains which incorporate external documents; and evaluating LLM outputs.
\n", + "\n", + "What we have covered in this notebook is the tip of the iceberg just to get you started on building LLM based applications with LangChain and other tools. There are a lot of other things to learn and check.\n", + "- What are other frameworks which perform the same tasks as LangChain?\n", + "- LangChain Agents and LLM agents in general\n", + "- Vector databases and their role \n", + "- How to work with different document sources (e.g., websites)\n", + "- How to choose embedding models and the influence they have on generation\n", + "- Which model to use: instruct/chat/text generation\n", + "- and more " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ab0ee5b1-8133-4405-98fc-e56057daece6", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python3.12-audio", + "language": "python", + "name": "audio" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.2" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/notebooks/malawi-nov-24/README.md b/notebooks/malawi-nov-24/README.md new file mode 100644 index 0000000..38f8eff --- /dev/null +++ b/notebooks/malawi-nov-24/README.md @@ -0,0 +1,87 @@ +# Programming Activities for the Course + +This document outlines the programming activities for the course, focusing on hands-on projects to apply the concepts learned. The document is organized into two main sections: an introduction to LLM capabilities and LangChain, followed by a practical exercise on deploying a chatbot with Streamlit.
+ +## LLM Foundations-understanding the ML Process + +## Introducing LLM Capabilities and LangChain + +In this section, we explore the foundational capabilities of Large Language Models (LLMs) and how they can be applied in real-world scenarios. By leveraging LLMs, you can build applications such as chatbots, document analyzers, and automated support systems. + +### Understanding LLM Capabilities +LLMs are capable of generating human-like text, answering questions, summarizing content, and even performing tasks like sentiment analysis and named entity recognition. These models can process and interpret vast amounts of textual data, making them ideal for a variety of applications across domains. + +### Introducing LangChain +LangChain is a powerful framework that simplifies the process of integrating LLMs into applications. It provides modular components and utility functions to create chains (pipelines) that combine different tasks, such as prompting, data processing, and memory management, all within a cohesive system. LangChain makes it easier to build applications that require complex interactions with language models, including: + +- **Prompt Engineering**: Designing effective prompts to achieve desired responses from the LLM. +- **Data Handling**: Loading, processing, and storing large document corpora. +- **Chain Management**: Creating workflows that link multiple steps, such as data loading, prompt generation, and response handling. + +### Building Applications with LangChain +LangChain supports various use cases, including: + +1. **QA Chatbots**: Answering user questions based on specific datasets. +2. **Document Analysis**: Extracting information, summarizing content, or classifying documents. +3. **Automated Support Systems**: Handling customer service or FAQ queries. + +In this course, we will apply these capabilities to build a QA chatbot using LangChain, deploy it on Streamlit, and explore its functionality through real-world examples. 
+ +--- + + +# Deploying a Chatbot on Streamlit +In this activity, you will use the knowledge gained from the LangChain Tutorial to explore a chatbot deployed on Streamlit. You will deploy this app on your computer and interact with it. + +## About Streamlit + +As discussed in the lectures, Streamlit is a platform that enables data scientists to deploy dynamic, data-based apps. It’s ideal for prototyping demonstration apps and sharing them with stakeholders before full-scale production deployment. + +## Initial Setup and Getting the Chatbot Files + +1. **Get OpenAI and Hugging Face API Credentials** + The chatbot uses OpenAI models, so you’ll need to sign up for an OpenAI developer account and obtain an API key. For a step-by-step guide on creating an OpenAI API key, search for instructions on ChatGPT. Similarly, create a Hugging Face account and obtain an API token. + +2. **Try the Chatbot on Streamlit Community Cloud** + Before downloading anything, you can try the chatbot on the Streamlit Community Cloud with just the OpenAI and Hugging Face keys. + +3. **Download or Clone the Project Repository** + To get the project files on your computer, either clone the GitHub repository (if familiar with Git) or download the repository as a zipped file. + +## Deploying the Streamlit App Locally + +1. **Unzip and Navigate to the Project Folder** + Once unzipped, open the project folder and follow the instructions on the GitHub page to deploy the chatbot. + +2. **Follow steps on GitHub project repository**. [Streamlit app repo](https://github.com/worldbank/RAG-Based-ChatBot-Example) + + +3. **Install Required Packages** + The `requirements.txt` file contains a list of all required packages. If you encounter a missing package error, try installing the package again (ensuring your virtual environment is activated). + +4. **Run the App Locally** + Run the app with the following command: + ```bash + streamlit run streamlit_app.py + ``` +5. **Test and Check**. 
When deployed locally, you can browse the files being used in the app. + +## Explore Important Scripts + +The essential components for building a chatbot with LangChain are organized into distinct, modular Python scripts. Let’s explore some of these elements. You can use VS Code or your preferred text editor for this task. + +### Loading Files +In real-life applications, you may need to load hundreds of documents, requiring a versatile function for file loading. This project includes two types of loaders: +- **`remote_loader.py`**: For loading documents from websites. +- **`local_loader.py`**: For loading documents from the local `data` folder. + +### Document Splitting +The `splitter.py` module uses the `RecursiveCharacterTextSplitter` strategy, with a chunk size of 1000 and an overlap of 0. This method helps in breaking down large documents into manageable sections for processing. + +### Prompt Chains +In the `full_chain.py`, `base_chain.py`, and `rag_chain.py` modules, you’ll find configurations for the specific LLM models and prompting strategies used. The project utilizes OpenAI chat models, with customized chains designed to guide interactions effectively. + +### Memory Management +Memory management strategies are also implemented to optimize the chatbot’s performance, particularly for long interactions or when processing large datasets. + + diff --git a/notebooks/nasa-apod.ipynb b/notebooks/nasa-apod.ipynb deleted file mode 100644 index 18553cb..0000000 --- a/notebooks/nasa-apod.ipynb +++ /dev/null @@ -1,282 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "90700fdc-fcc7-4e54-8c9e-449879d8c66d", - "metadata": { - "tags": [] - }, - "source": [ - "# Securely Using API Keys\n", - "\n", - "> The following are (opinionated) best practices to store and use API keys in your source code. If you disagree, please consider [contributing](https://github.com/worldbank/template/issues/new/choose). 
" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "ac0ed1c0", - "metadata": {}, - "source": [ - "## Environment Variables\n", - "\n", - "An [environment variable](https://en.wikipedia.org/wiki/Environment_variable) is a dynamic-named value that can be used to store information on a computer. For instance, an environment variable can be used to store settings and/or privileged information (e.g. API keys) on your local computer or server.\n", - "\n", - "To set a environment variable to a new value, in **Unix-like** systems, you must pass a `name` and a `value` pair as shown below in the terminal.\n", - "\n", - "```shell\n", - "export SECRET_API_KEY = \n", - "```" - ] - }, - { - "cell_type": "markdown", - "id": "080bd097-f128-4759-946d-793368230804", - "metadata": { - "tags": [] - }, - "source": [ - "The `value` is accessible by the `name` without being exposed throughout the system. In particular, in [Python](https://python.org), the value can be retrieve as follows." - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "8d023b4e-496b-440c-91a7-199bceb44d7d", - "metadata": { - "tags": [] - }, - "source": [ - "```python\n", - "secret_api_key = os.getenv(\"SECRET_API_KEY\")\n", - "```" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "54a99582-d509-4ab8-be42-ddb4921c0f45", - "metadata": { - "tags": [] - }, - "source": [ - "Alternatively, it is customary to use a `.env` file to organize and load environments variables as needed. 
Packages such as [dotenv](https://www.npmjs.com/package/dotenv) and [python-dotenv](https://pypi.org/project/python-dotenv/) will automatically load environments variables for you from the `.env` file.\n", - "\n", - "```shell\n", - "source .env\n", - "```" - ] - }, - { - "cell_type": "markdown", - "id": "3c0cc26a-2a99-49b0-a406-d57f31fff8ee", - "metadata": {}, - "source": [ - "With [Python](https://python.org)," - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "960398ce-eadb-45e3-b160-53e6c9250dd0", - "metadata": { - "tags": [ - "remove_output" - ] - }, - "outputs": [], - "source": [ - "from dotenv import load_dotenv\n", - "\n", - "load_dotenv()" - ] - }, - { - "cell_type": "markdown", - "id": "bda573d0-c877-42e6-8ee5-3000b780b4b7", - "metadata": { - "tags": [] - }, - "source": [ - "With [Jupyter](https://jupyter.org)," - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "2e700464-b50d-4b06-b0aa-afaafa17e68e", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "%load_ext dotenv\n", - "%dotenv" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "660db869", - "metadata": {}, - "source": [ - "The template includes `.env.example` as an example; to use, simply rename it to `.env` and add your settings and secrets to it. Please note that `.env` **must** never be committed/versioned (for example, to GitHub) and **should** be ignored on `.gitignore`. " - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "74484f7e", - "metadata": {}, - "source": [ - "```{tip}\n", - "While environments variables are a convenient way to minimize the security risk, it is important to emphasize secrets are still stored in plaintext in your computer. 
It is strongly recommended to use instead a secret manager, such as [AWS Secrets Manager](https://aws.amazon.com/secrets-manager/) or [1Password](https://developer.1password.com/docs/cli/secret-references).\n", - "```" - ] - }, - { - "cell_type": "markdown", - "id": "14e89727", - "metadata": {}, - "source": [ - "## Astronomy Picture of the Day" - ] - }, - { - "cell_type": "markdown", - "id": "b4c0f3e8-7756-41bb-aa21-cc2eee5ff67f", - "metadata": {}, - "source": [ - "One of the most popular APIs is NASA's [Astronomy Picture of the Day](https://apod.nasa.gov/apod/astropix.html). Let's see in the following example how to use the NASA API with a secret API key." - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "d797ef77-6ca4-4f9d-a1f8-abbfd9884b07", - "metadata": { - "tags": [ - "hide-cell" - ] - }, - "outputs": [], - "source": [ - "import os\n", - "\n", - "import httpx\n", - "from IPython.display import Image" - ] - }, - { - "cell_type": "markdown", - "id": "ece37244", - "metadata": {}, - "source": [ - "First, you will have to [generate your API key](https://api.nasa.gov) and set up the environment variable `NASA_API_KEY` with its value. Now you are ready to use it in your code. For instance, in this example, we assign it to `api_key`. Please note that the value is never exposed and the notebook can be securely shared with anyone. " - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "7b914e66-7ae8-4d8b-9621-d6dc5ec49631", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "api_key = os.getenv(\"NASA_API_KEY\")" - ] - }, - { - "cell_type": "markdown", - "id": "10b5b12a", - "metadata": {}, - "source": [ - "Now, we are ready to make the request to the NASA API. According to the [documentation](https://github.com/nasa/apod-api#docs), the `api_key` is passed a parameter to the GET request. 
" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "id": "1990c3b9-f145-4c1f-bbb5-82f50801a011", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "async with httpx.AsyncClient() as client:\n", - " r = await client.get(\n", - " \"https://api.nasa.gov/planetary/apod\", params={\"api_key\": api_key}\n", - " )" - ] - }, - { - "cell_type": "markdown", - "id": "e952e343", - "metadata": {}, - "source": [ - "Voilà!" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "id": "bd1cb597-0144-43e8-bed8-12145a831a0c", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/html": [ - "" - ], - "text/plain": [ - "" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "Image(url=r.json()[\"hdurl\"])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8c7cb67e-c7ba-4ed3-bee0-36d303c1517d", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.12" - }, - "vscode": { - "interpreter": { - "hash": "ce6d896885f4e28373aa2ff7c44f136ed5a497e2abd203a79a632f5859ed7bb5" - } - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/notebooks/tunisia-may-24/1-text2sqL-demo.ipynb b/notebooks/tunisia-may-24/1-text2sqL-demo.ipynb new file mode 100644 index 0000000..779899e --- /dev/null +++ b/notebooks/tunisia-may-24/1-text2sqL-demo.ipynb @@ -0,0 +1,647 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "5eb5dc4b-add1-4b1d-8fbb-3a1e85e552f7", + "metadata": {}, + "source": [ + "# Chatting with a Population Dataset Using LangChain and LLMs\n", + "\n", + "----\n", + "\n", + "In 
this simple demonstration, we show how you can use natural language to query a structured dataset. The dataset is enumeration-area-level data from the 2018 Malawi population census." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "a16425c5-c0ee-4bc9-8f80-1684edc5a843", + "metadata": {}, + "outputs": [ + { + "name": "stdin", + "output_type": "stream", + "text": [ + " ········\n" + ] + } + ], + "source": [ + "import getpass\n", + "import pandas as pd\n", + "import os\n", + "\n", + "os.environ[\"OPENAI_API_KEY\"] = getpass.getpass()" + ] + }, + { + "cell_type": "markdown", + "id": "5186c265-b78c-41f6-a4a4-4401e6ccb7cf", + "metadata": {}, + "source": [ + "## 1. Creating a SQLite Database\n", + "Use a CSV file to create a database. The file that was used to create the database is shown below as a pandas DataFrame." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "1b2308a3-7bb2-47ca-86c8-01ff060105e4", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
RegionNameDistrictNameTANameEnumerationAreaTotalPopulationPopulationMalePopulationFemaleNumberHouseholds
0CentralNtchisiTA Malengaea-20307802633.0331.0302.0145.0
1CentralNtchisiTA Malengaea-203070251006.0507.0499.0226.0
2CentralNtchisiTA Malengaea-203070071503.0740.0763.0338.0
3CentralNtchisiTA Malengaea-203070051139.0553.0586.0251.0
4CentralNtchisiTA Malengaea-203070121400.0668.0732.0284.0
\n", + "
" + ], + "text/plain": [ + " RegionName DistrictName TAName EnumerationArea TotalPopulation \\\n", + "0 Central Ntchisi TA Malenga ea-20307802 633.0 \n", + "1 Central Ntchisi TA Malenga ea-20307025 1006.0 \n", + "2 Central Ntchisi TA Malenga ea-20307007 1503.0 \n", + "3 Central Ntchisi TA Malenga ea-20307005 1139.0 \n", + "4 Central Ntchisi TA Malenga ea-20307012 1400.0 \n", + "\n", + " PopulationMale PopulationFemale NumberHouseholds \n", + "0 331.0 302.0 145.0 \n", + "1 507.0 499.0 226.0 \n", + "2 740.0 763.0 338.0 \n", + "3 553.0 586.0 251.0 \n", + "4 668.0 732.0 284.0 " + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_pop = pd.read_csv(\"mw-ea-pop.csv\")\n", + "df_pop.head(5)" + ] + }, + { + "cell_type": "markdown", + "id": "1a28fd50-a6c4-4929-8212-b2f4c9889b80", + "metadata": {}, + "source": [ + "## 2. Setup LangChain for Connecting to Database\n", + "The tool we will use is called LangChain. It's a popular tool for creating apps on top of LLMs. During the course, we will delve more into using LangChain."
+ ] + }, + { + "cell_type": "markdown", + "id": "cbe6719b-2c38-49d7-8c96-622fc6900207", + "metadata": {}, + "source": [ + "### 2.1 Import LangChain Packages and Setup Connection" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "e1ed50a5-adc2-4ab2-a4ca-ad9e487fa464", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain_community.utilities import SQLDatabase\n", + "from langchain.chains import create_sql_query_chain\n", + "from langchain_openai import ChatOpenAI\n", + "\n", + "from langchain_community.tools.sql_database.tool import QuerySQLDataBaseTool\n", + "\n", + "from operator import itemgetter\n", + "\n", + "from langchain_core.output_parsers import StrOutputParser\n", + "from langchain_core.prompts import PromptTemplate\n", + "from langchain_core.runnables import RunnablePassthrough" + ] + }, + { + "cell_type": "markdown", + "id": "ef46f940-9e27-4e49-b84a-41001ba9a79d", + "metadata": {}, + "source": [ + "### 2.2 Create the SQL Agent and a Chain" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "c44bb470-9afd-4fcd-bf67-c3cc08ddfcb9", + "metadata": {}, + "outputs": [], + "source": [ + "# Test connection to the database\n", + "db = SQLDatabase.from_uri(\"sqlite:///mydatabase.db\")\n", + "llm = ChatOpenAI(model=\"gpt-3.5-turbo\", temperature=0)\n", + "\n", + "execute_query = QuerySQLDataBaseTool(db=db)\n", + "write_query = create_sql_query_chain(llm, db)\n", + "chain = write_query | execute_query" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "2da6cdd0-012b-4fb2-8ae3-bde91d67aa20", + "metadata": {}, + "outputs": [], + "source": [ + "answer_prompt = PromptTemplate.from_template(\n", + " \"\"\"Given the following user question, corresponding SQL query, and SQL result, answer the user question.\n", + "\n", + "Question: {question}\n", + "SQL Query: {query}\n", + "SQL Result: {result}\n", + "Answer: \"\"\"\n", + ")\n", + "\n", + "answer = answer_prompt | llm | StrOutputParser()\n", + "chain = 
(\n", + " RunnablePassthrough.assign(query=write_query).assign(\n", + " result=itemgetter(\"query\") | execute_query\n", + " )\n", + " | answer\n", + ")\n" + ] + }, + { + "cell_type": "markdown", + "id": "6f39f651-c3e4-4fa7-b67a-b6b8c91de57f", + "metadata": {}, + "source": [ + "## 3. Chat with the Data" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "2ff768f4-7622-43db-afbe-4155bf5eeff2", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Failed to batch ingest runs: LangSmithError('Failed to POST https://api.smith.langchain.com/runs/batch in LangSmith API. HTTPError(\\'403 Client Error: Forbidden for url: https://api.smith.langchain.com/runs/batch\\', \\'{\"detail\":\"Forbidden\"}\\')')\n", + "Failed to batch ingest runs: LangSmithError('Failed to POST https://api.smith.langchain.com/runs/batch in LangSmith API. HTTPError(\\'403 Client Error: Forbidden for url: https://api.smith.langchain.com/runs/batch\\', \\'{\"detail\":\"Forbidden\"}\\')')\n" + ] + }, + { + "data": { + "text/plain": [ + "'There are 33 districts in Malawi.'" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Failed to batch ingest runs: LangSmithError('Failed to POST https://api.smith.langchain.com/runs/batch in LangSmith API. 
HTTPError(\\'403 Client Error: Forbidden for url: https://api.smith.langchain.com/runs/batch\\', \\'{\"detail\":\"Forbidden\"}\\')')\n" + ] + } + ], + "source": [ + "user_question = \"How many districts are there in Malawi\"\n", + "chain.invoke({\"question\": \"{}\".format(user_question)})" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "b6d13719-e60e-4cff-afdb-c76253a65fc1", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "32" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# VERIFY THIS INFORMATION USING PYTHON\n", + "df_pop.DistrictName.nunique()" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "a6305316-c437-4e86-bb3a-8f75450897e6", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Failed to batch ingest runs: LangSmithError('Failed to POST https://api.smith.langchain.com/runs/batch in LangSmith API. HTTPError(\\'403 Client Error: Forbidden for url: https://api.smith.langchain.com/runs/batch\\', \\'{\"detail\":\"Forbidden\"}\\')')\n", + "Failed to batch ingest runs: LangSmithError('Failed to POST https://api.smith.langchain.com/runs/batch in LangSmith API. HTTPError(\\'403 Client Error: Forbidden for url: https://api.smith.langchain.com/runs/batch\\', \\'{\"detail\":\"Forbidden\"}\\')')\n" + ] + }, + { + "data": { + "text/plain": [ + "'There are 9,042,289 women in Malawi.'" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Failed to batch ingest runs: LangSmithError('Failed to POST https://api.smith.langchain.com/runs/batch in LangSmith API. 
HTTPError(\\'403 Client Error: Forbidden for url: https://api.smith.langchain.com/runs/batch\\', \\'{\"detail\":\"Forbidden\"}\\')')\n" + ] + } + ], + "source": [ + "user_question = \"How many women are there in Malawi\"\n", + "chain.invoke({\"question\": \"{}\".format(user_question)})" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "23e0abae-b0af-4965-9c37-00cafc30db5d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "9042289.0" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# VERIFY THIS INFORMATION USING PYTHON\n", + "df_pop.PopulationFemale.sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "e18bb38b-6313-4fd8-b2b8-b079c0e21e31", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Failed to batch ingest runs: LangSmithError('Failed to POST https://api.smith.langchain.com/runs/batch in LangSmith API. HTTPError(\\'403 Client Error: Forbidden for url: https://api.smith.langchain.com/runs/batch\\', \\'{\"detail\":\"Forbidden\"}\\')')\n", + "Failed to batch ingest runs: LangSmithError('Failed to POST https://api.smith.langchain.com/runs/batch in LangSmith API. HTTPError(\\'403 Client Error: Forbidden for url: https://api.smith.langchain.com/runs/batch\\', \\'{\"detail\":\"Forbidden\"}\\')')\n" + ] + }, + { + "data": { + "text/plain": [ + "'There are 246,415 women in Salima district.'" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Failed to batch ingest runs: LangSmithError('Failed to POST https://api.smith.langchain.com/runs/batch in LangSmith API. 
HTTPError(\\'403 Client Error: Forbidden for url: https://api.smith.langchain.com/runs/batch\\', \\'{\"detail\":\"Forbidden\"}\\')')\n" + ] + } + ], + "source": [ + "user_question = \"How many women are there in Salima district\"\n", + "chain.invoke({\"question\": \"{}\".format(user_question)})" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "d1f8a6ca-6d53-4eca-80ae-09eb07ad886f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "246415.0" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# We can check that the answer above is correct using Python code\n", + "df_pop.query('DistrictName == \"Salima\"')['PopulationFemale'].sum()" + ] + }, + { + "cell_type": "markdown", + "id": "358519f6-98e3-4d45-ba6d-96560c67dcab", + "metadata": {}, + "source": [ + "### Complicated question" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "96f94c49-1d06-4765-8af1-12b2d217b8da", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Failed to batch ingest runs: LangSmithError('Failed to POST https://api.smith.langchain.com/runs/batch in LangSmith API. HTTPError(\\'403 Client Error: Forbidden for url: https://api.smith.langchain.com/runs/batch\\', \\'{\"detail\":\"Forbidden\"}\\')')\n", + "Failed to batch ingest runs: LangSmithError('Failed to POST https://api.smith.langchain.com/runs/batch in LangSmith API. 
HTTPError(\\'403 Client Error: Forbidden for url: https://api.smith.langchain.com/runs/batch\\', \\'{\"detail\":\"Forbidden\"}\\')')\n" + ] + }, + { + "data": { + "text/plain": [ + "'Approximately 51.48% of the population in Malawi is female.'" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Failed to batch ingest runs: LangSmithError('Failed to POST https://api.smith.langchain.com/runs/batch in LangSmith API. HTTPError(\\'403 Client Error: Forbidden for url: https://api.smith.langchain.com/runs/batch\\', \\'{\"detail\":\"Forbidden\"}\\')')\n" + ] + } + ], + "source": [ + "user_question = \"What percent of the population is female in Malawi?\"\n", + "chain.invoke({\"question\": \"{}\".format(user_question)})" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "2033ecd1-0714-4bb0-a582-7dde4c366364", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "51.482681744085504" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "fem = df_pop.PopulationFemale.sum()\n", + "tot = df_pop.TotalPopulation.sum()\n", + "\n", + "fem/tot*100" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "c267ea6e-6028-4716-9731-30beabf8b3f1", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Failed to batch ingest runs: LangSmithError('Failed to POST https://api.smith.langchain.com/runs/batch in LangSmith API. HTTPError(\\'403 Client Error: Forbidden for url: https://api.smith.langchain.com/runs/batch\\', \\'{\"detail\":\"Forbidden\"}\\')')\n", + "Failed to batch ingest runs: LangSmithError('Failed to POST https://api.smith.langchain.com/runs/batch in LangSmith API. 
HTTPError(\\'403 Client Error: Forbidden for url: https://api.smith.langchain.com/runs/batch\\', \\'{\"detail\":\"Forbidden\"}\\')')\n" + ] + }, + { + "data": { + "text/plain": [ + "'Based on the SQL query and result provided, we are only retrieving the population of males in the specified region (Central, Ntchisi, TA Malenga) for the last four years. We are not directly comparing the number of men over the years to determine if they are increasing. To answer the user question accurately, we would need to retrieve the population data for men in Malawi over the last four years and compare the numbers to see if there is an increase.'" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Failed to batch ingest runs: LangSmithError('Failed to POST https://api.smith.langchain.com/runs/batch in LangSmith API. HTTPError(\\'403 Client Error: Forbidden for url: https://api.smith.langchain.com/runs/batch\\', \\'{\"detail\":\"Forbidden\"}\\')')\n" + ] + } + ], + "source": [ + "user_question = \"are the number of men increasing in the four last years in malawi\"\n", + "chain.invoke({\"question\": \"{}\".format(user_question)})" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "986fd189-6019-4a8a-bf3a-4fdc4f6e708e", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Failed to batch ingest runs: LangSmithError('Failed to POST https://api.smith.langchain.com/runs/batch in LangSmith API. HTTPError(\\'403 Client Error: Forbidden for url: https://api.smith.langchain.com/runs/batch\\', \\'{\"detail\":\"Forbidden\"}\\')')\n", + "Failed to batch ingest runs: LangSmithError('Failed to POST https://api.smith.langchain.com/runs/batch in LangSmith API. 
HTTPError(\\'403 Client Error: Forbidden for url: https://api.smith.langchain.com/runs/batch\\', \\'{\"detail\":\"Forbidden\"}\\')')\n" + ] + }, + { + "data": { + "text/plain": [ + "'The fertility rate of Malawi can be calculated by dividing the total female population by the total population. \\n\\nFor the first set of data:\\nFertility rate = total_female_population / total_population\\nFertility rate = 1303 / 2604\\nFertility rate = 0.5008\\n\\nFor the second set of data:\\nFertility rate = total_female_population / total_population\\nFertility rate = 9042289 / 17563749\\nFertility rate = 0.5143\\n\\nTherefore, the fertility rate of Malawi is approximately 0.5008 for the first set of data and 0.5143 for the second set of data.'" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Failed to batch ingest runs: LangSmithError('Failed to POST https://api.smith.langchain.com/runs/batch in LangSmith API. HTTPError(\\'403 Client Error: Forbidden for url: https://api.smith.langchain.com/runs/batch\\', \\'{\"detail\":\"Forbidden\"}\\')')\n" + ] + } + ], + "source": [ + "user_question = \"what is the fertilely rate of Malawi(Calculate)?\"\n", + "chain.invoke({\"question\": \"{}\".format(user_question)})" + ] + }, + { + "cell_type": "markdown", + "id": "8e977b26-373b-4482-b613-bd1f70276b6f", + "metadata": {}, + "source": [ + "## 4. EXERCISE: What Question Do You Want Me to Try?\n", + "Share any question in the chat you would like me to try based on this dataset so that we see how much it can handle. \n", + "\n", + "- **Share your question on the chat**\n", + "- **I will run the question here and we will inspect the response together**" + ] + }, + { + "cell_type": "markdown", + "id": "c07e2bc8-c41c-49dd-ac6b-1a77d6c6e168", + "metadata": {}, + "source": [ + "## 5. 
What We will Do During the Course\n", + "During the course we will use LangChain to build our own **Ask-A-Question (AAQ)** type \n", + "of Chatbot to enable a user to chat with a dataset by asking natural language questions. \n", + "We will build an interactive app like [this](https://llm-examples.streamlit.app) using Streamlit and be able to share it with others." + ] + }, + { + "cell_type": "markdown", + "id": "3b84bb26-3dff-4d1b-af02-8edc18ac2f36", + "metadata": {}, + "source": [ + "# Deployment\n", + "1. Web app\n", + "2. WhatsApp \n", + "3. Chatbot on website of NSO or Health ministry" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "72accb41-83da-4781-bf06-2488db5f91d1", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.8" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/notebooks/tunisia-may-24/2-document-classification-with-sklearn.ipynb b/notebooks/tunisia-may-24/2-document-classification-with-sklearn.ipynb new file mode 100644 index 0000000..20c458b --- /dev/null +++ b/notebooks/tunisia-may-24/2-document-classification-with-sklearn.ipynb @@ -0,0 +1,575 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Building a Document Classification System\n", + "The NumPy (Numerical Python) library is used for working with arrays, and the Scikit-learn library is a Python library built on NumPy, SciPy and matplotlib for data analytics and machine learning.
The NLTK (Natural Language Toolkit) provides access to over 50 corpora and lexical resources such as WordNet, along with a suite of text processing libraries for classification, tokenization, stemming, tagging, parsing, and semantic reasoning, and wrappers for industrial-strength NLP libraries." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Ensuring that you have the necessary libraries\n", + "# !pip install nltk\n", + "# !pip install numpy\n", + "# !pip install scikit-learn" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import nltk\n", + "from nltk.corpus import reuters\n", + "from sklearn.feature_extraction.text import TfidfVectorizer\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.svm import LinearSVC\n", + "from sklearn.metrics import accuracy_score, classification_report\n", + "\n", + "from sklearn.feature_extraction.text import CountVectorizer\n", + "from sklearn.naive_bayes import MultinomialNB" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 1. Load your data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The Reuters-21578 dataset is one of the most widely used data collections for text categorization research. It is a collection of documents with news articles and the original corpus has 10,369 documents and a vocabulary of 29,930 words, with labeled categories such as \"earnings\" and \"acquisitions\", among others.
You can read metadata about the dataset on [Hugging Face](https://huggingface.co/datasets/ucirvine/reuters21578)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[nltk_data] Downloading package reuters to\n", + "[nltk_data] /Users/dunstanmatekenya/nltk_data...\n", + "[nltk_data] Package reuters is already up-to-date!\n" + ] + }, + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# download the dataset\n", + "nltk.download('reuters')" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "# Load the Reuters-21578 dataset\n", + "documents = reuters.fileids()\n", + "train_docs = list(filter(lambda doc: doc.startswith(\"train\"), documents))\n", + "test_docs = list(filter(lambda doc: doc.startswith(\"test\"), documents))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 2. Prepare your data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Prepare the data by extracting the raw text and category labels for both the training and testing documents. Assumption is that each document has only one category label, so we take only the first category label for each document." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "# Prepare the data\n", + "train_data = [reuters.raw(doc_id) for doc_id in train_docs]\n", + "train_labels = [reuters.categories(doc_id)[0] for doc_id in train_docs]\n", + "test_data = [reuters.raw(doc_id) for doc_id in test_docs]\n", + "test_labels = [reuters.categories(doc_id)[0] for doc_id in test_docs]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Question-How many different classes are in the training data?" 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Explore some of the training examples" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Article content: COMPUTER TERMINAL SYSTEMS <CPML> COMPLETES SALE\n", + " Computer Terminal Systems Inc said\n", + " it has completed the sale of 200,000 shares of its common\n", + " stock, and warrants to acquire an additional one mln shares, to\n", + " <Sedio N.V.> of Lugano, Switzerland for 50,000 dlrs.\n", + " The company said the warrants are exercisable for five\n", + " years at a purchase price of .125 dlrs per share.\n", + " Computer Terminal said Sedio also has the right to buy\n", + " additional shares and increase its total holdings up to 40 pct\n", + " of the Computer Terminal's outstanding common stock under\n", + " certain circumstances involving change of control at the\n", + " company.\n", + " The company said if the conditions occur the warrants would\n", + " be exercisable at a price equal to 75 pct of its common stock's\n", + " market price at the time, not to exceed 1.50 dlrs per share.\n", + " Computer Terminal also said it sold the technolgy rights to\n", + " its Dot Matrix impact technology, including any future\n", + " improvements, to <Woodco Inc> of Houston, Tex. for 200,000\n", + " dlrs. 
But, it said it would continue to be the exclusive\n", + " worldwide licensee of the technology for Woodco.\n", + " The company said the moves were part of its reorganization\n", + " plan and would help pay current operation costs and ensure\n", + " product delivery.\n", + " Computer Terminal makes computer generated labels, forms,\n", + " tags and ticket printers and terminals.\n", + " \n", + "\n", + " n\\, Label: acq\n" + ] + } + ], + "source": [ + "print(\"Article content: {} n\\, Label: {}\".format(train_data[1], train_labels[1]))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 3. Vectorizing the text data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- Vectorize the text data using the TfidfVectorizer from scikit-learn. TF-IDF is an abbreviation for Term Frequency Inverse Document Frequency. This is a very common algorithm to transform text into a meaningful representation of numbers which is used to fit machine learning algorithms for prediction. \n", + "- It's worth noting that nowadays, this vectorization approach is not commonly used. We will cover **word embeddings** tomorrow which is a better approach to represent words as numbers because **vector embeddings** can capture semantic meanings better.\n", + "\n", + "For the sklearn TF-IDF vectorizer, you can learn more about it [here](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "# Vectorize the text data\n", + "vectorizer = TfidfVectorizer(stop_words=\"english\", max_features=1000)\n", + "X_train = vectorizer.fit_transform(train_data)\n", + "X_test = vectorizer.transform(test_data)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Question: What role are the ```stop words``` playing in the code above? You might have learned this from Prof. Mohamad Ali already."
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 4. Training a Linear Support Vector Machine (LinearSVC) classifier using the vectorized training data and corresponding label" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
LinearSVC()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + ], + "text/plain": [ + "LinearSVC()" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Train the classifier\n", + "classifier = LinearSVC()\n", + "classifier.fit(X_train, train_labels)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "classifier." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 5. Evaluate the classifier used and calculate the accuracy score as well as some other metrics (Precision, Recall and F-1 score)" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Accuracy: 0.876117919841007\n", + " precision recall f1-score support\n", + "\n", + " acq 0.95 0.96 0.96 719\n", + " alum 0.33 0.18 0.24 22\n", + " barley 1.00 0.71 0.83 14\n", + " bop 0.77 0.80 0.79 30\n", + " carcass 0.79 0.65 0.71 17\n", + " castor-oil 0.00 0.00 0.00 1\n", + " cocoa 0.94 1.00 0.97 17\n", + " coconut 0.00 0.00 0.00 2\n", + " coconut-oil 0.00 0.00 0.00 2\n", + " coffee 0.89 0.96 0.92 25\n", + " copper 0.93 0.93 0.93 15\n", + " corn 0.85 0.81 0.83 48\n", + " cotton 1.00 0.86 0.92 14\n", + " cpi 0.62 0.62 0.62 24\n", + " cpu 0.00 0.00 0.00 1\n", + " crude 0.79 0.93 0.86 182\n", + " dfl 0.00 0.00 0.00 1\n", + " dlr 0.70 0.72 0.71 43\n", + " dmk 0.00 0.00 0.00 1\n", + " earn 0.98 0.99 0.98 1083\n", + " fuel 1.00 0.22 0.36 9\n", + " gas 0.75 0.33 0.46 9\n", + " gnp 0.59 0.89 0.71 19\n", + " gold 0.96 0.96 0.96 26\n", + " grain 0.71 0.77 0.74 77\n", + " groundnut 0.00 0.00 0.00 3\n", + " heat 1.00 0.75 0.86 4\n", + " hog 1.00 0.50 0.67 4\n", + " housing 1.00 0.67 0.80 3\n", + " income 1.00 0.80 0.89 5\n", + " instal-debt 1.00 1.00 1.00 1\n", + " interest 0.78 0.76 0.77 124\n", + " ipi 1.00 1.00 1.00 11\n", + " iron-steel 0.69 0.64 0.67 14\n", + " jet 0.00 0.00 0.00 1\n", + " jobs 0.73 0.85 0.79 13\n", + " l-cattle 
0.00 0.00 0.00 2\n", + " lead 0.83 0.42 0.56 12\n", + " lei 1.00 1.00 1.00 3\n", + " livestock 0.50 0.50 0.50 6\n", + " lumber 0.00 0.00 0.00 5\n", + " meal-feed 0.20 0.17 0.18 6\n", + " money-fx 0.65 0.65 0.65 96\n", + " money-supply 0.80 0.83 0.81 29\n", + " naphtha 0.00 0.00 0.00 1\n", + " nat-gas 0.64 0.54 0.58 13\n", + " nickel 0.00 0.00 0.00 1\n", + " oilseed 0.54 0.54 0.54 13\n", + " orange 0.75 0.33 0.46 9\n", + " palladium 0.00 0.00 0.00 1\n", + " palm-oil 0.67 1.00 0.80 4\n", + " pet-chem 1.00 0.50 0.67 6\n", + " platinum 0.00 0.00 0.00 3\n", + " potato 1.00 0.67 0.80 3\n", + " propane 0.00 0.00 0.00 2\n", + " rape-oil 0.00 0.00 0.00 1\n", + " reserves 1.00 0.64 0.78 14\n", + " retail 1.00 1.00 1.00 1\n", + " rice 0.00 0.00 0.00 1\n", + " rubber 0.69 1.00 0.82 9\n", + " ship 0.39 0.41 0.40 39\n", + " silver 0.00 0.00 0.00 0\n", + " soy-oil 0.00 0.00 0.00 2\n", + " soybean 0.00 0.00 0.00 2\n", + "strategic-metal 0.00 0.00 0.00 6\n", + " sugar 0.71 0.96 0.81 25\n", + " tea 0.00 0.00 0.00 3\n", + " tin 0.71 0.50 0.59 10\n", + " trade 0.70 0.93 0.80 76\n", + " veg-oil 0.54 0.64 0.58 11\n", + " wpi 0.62 0.56 0.59 9\n", + " yen 0.00 0.00 0.00 6\n", + " zinc 0.00 0.00 0.00 5\n", + "\n", + " accuracy 0.88 3019\n", + " macro avg 0.53 0.48 0.49 3019\n", + " weighted avg 0.86 0.88 0.87 3019\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/dunstanmatekenya/anaconda3/lib/python3.10/site-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n", + " _warn_prf(average, modifier, msg_start, len(result))\n", + "/Users/dunstanmatekenya/anaconda3/lib/python3.10/site-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. 
Use `zero_division` parameter to control this behavior.\n", + " _warn_prf(average, modifier, msg_start, len(result))\n", + "/Users/dunstanmatekenya/anaconda3/lib/python3.10/site-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n", + " _warn_prf(average, modifier, msg_start, len(result))\n", + "/Users/dunstanmatekenya/anaconda3/lib/python3.10/site-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n", + " _warn_prf(average, modifier, msg_start, len(result))\n", + "/Users/dunstanmatekenya/anaconda3/lib/python3.10/site-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n", + " _warn_prf(average, modifier, msg_start, len(result))\n", + "/Users/dunstanmatekenya/anaconda3/lib/python3.10/site-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in labels with no true samples. Use `zero_division` parameter to control this behavior.\n", + " _warn_prf(average, modifier, msg_start, len(result))\n" + ] + } + ], + "source": [ + "# Evaluate the classifier\n", + "y_pred = classifier.predict(X_test)\n", + "accuracy = accuracy_score(test_labels, y_pred)\n", + "print(\"Accuracy:\", accuracy)\n", + "print(classification_report(test_labels, y_pred))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### 6. 
Classify new documents (new BBC headlines) by vectorizing them using the same TfidfVectorizer and predicting their labels using the trained classifier" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Predicted labels: ['ship' 'ship' 'acq']\n" + ] + } + ], + "source": [ + "# Classify new documents (recent headlines obtained from BBC news regarding Tunisia)\n", + "new_docs = [\n", + " \"Tunisia says 23 people missing in Mediterranean sea.\",\n", + " \"Tunisia officials arrested in dispute over flag display.\",\n", + " \"Tunisia lawyer arrested during live news broadcast.\"\n", + "]\n", + "new_docs_vectors = vectorizer.transform(new_docs)\n", + "predicted_labels = classifier.predict(new_docs_vectors)\n", + "print(\"Predicted labels:\", predicted_labels)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Discussion" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "How did this classifier fare? What can you do to improve the model?
\n", + "Ans: Experimenting with different preprocessing techniques, feature extraction models and classification algorithms." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Trying with a different classifier" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Steps 1 - 3 will be the same." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Load the Reuters-21578 dataset\n", + "documents = reuters.fileids()\n", + "train_docs = list(filter(lambda doc: doc.startswith(\"train\"), documents))\n", + "test_docs = list(filter(lambda doc: doc.startswith(\"test\"), documents))\n", + "\n", + "# Prepare the data\n", + "train_data = [reuters.raw(doc_id) for doc_id in train_docs]\n", + "train_labels = [reuters.categories(doc_id)[0] for doc_id in train_docs]\n", + "test_data = [reuters.raw(doc_id) for doc_id in test_docs]\n", + "test_labels = [reuters.categories(doc_id)[0] for doc_id in test_docs]\n", + "\n", + "# Vectorize the text data\n", + "vectorizer = CountVectorizer(stop_words=\"english\", max_features=1000)\n", + "X_train = vectorizer.fit_transform(train_data)\n", + "X_test = vectorizer.transform(test_data)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Different Classifier (Multinomial Naive Bayes)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "classifier = MultinomialNB()\n", + "classifier.fit(X_train, train_labels)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Evaluate the classifier\n", + "y_pred = classifier.predict(X_test)\n", + "accuracy = accuracy_score(test_labels, y_pred)\n", + "print(\"Accuracy:\", accuracy)\n", + "print(classification_report(test_labels, y_pred))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + 
"source": [ + "# Classify new documents (recent headlines obtained from BBC news regarding Tunisia)\n", + "new_docs = [\n", + " \"Tunisia says 23 people missing in Mediterranean sea.\",\n", + " \"Tunisia officials arrested in dispute over flag display.\",\n", + " \"Tunisia lawyer arrested during live news broadcast.\"\n", + "]\n", + "new_docs_vectors = vectorizer.transform(new_docs)\n", + "predicted_labels = classifier.predict(new_docs_vectors)\n", + "print(\"Predicted labels:\", predicted_labels)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Discussion: Compare the results" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The choice of classifier depends on the specific characteristics of your dataset and the problem at hand. Multinomial Naive Bayes is known to work well with text data and can handle high-dimensional feature spaces efficiently. However, it assumes that the features are independent of each other, which may not always be the case in real-world scenarios.\n", + "\n", + "You can also experiment with different classifiers, such as Logistic Regression, Random Forest, or Gradient Boosting, and compare their performance to find the best fit for your dataset. You can also refine the model by trying different feature extraction techniques and hyperparameters." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### There are also other ways you can approach this, for example, Document Classification using BERT. Here is a notebook example on Kaggle that you can explore: https://www.kaggle.com/code/merishnasuwal/document-classification-using-bert" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "BERT (Bidirectional Encoder Representations from Transformers) and other Transformer encoder architectures can also be used on a variety of tasks in NLP (natural language processing). 
They compute vector-space representations of natural language that are suitable for use in deep learning models. The BERT family of models uses the Transformer encoder architecture to process each token of input text in the full context of all tokens before and after. BERT models are usually pre-trained on a large corpus of text, then fine-tuned for specific tasks." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.8" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/notebooks/tunisia-may-24/3-intro-langchain.ipynb b/notebooks/tunisia-may-24/3-intro-langchain.ipynb new file mode 100644 index 0000000..fe6aef5 --- /dev/null +++ b/notebooks/tunisia-may-24/3-intro-langchain.ipynb @@ -0,0 +1,1896 @@ +{ + "cells": [ + { + "attachments": { + "7153af0c-fb8b-4b47-826e-57ac60696e0c.png": { + "image/png": "" + }, + "faf11697-6be8-49bc-ab24-b3c4385b8a67.png": { + "image/png": "" + } + }, + "cell_type": "markdown", + "id": "740ffa74-4eda-4843-9b5b-486caab1153b", + "metadata": { + "tags": [] + }, + "source": [ + "# Introduction to LangChain\n", + "---------\n", + "![image.png](attachment:faf11697-6be8-49bc-ab24-b3c4385b8a67.png)![image.png](attachment:7153af0c-fb8b-4b47-826e-57ac60696e0c.png)\n", + "\n", + "**DIHPA'24**\n", + "\n", + "**Author:** Dunstan Matekenya \n", + "\n", + "**Affiliation:** DECAT, The World Bank Group \n", + "\n", + "**Date:** May 30, 2024\n", + "\n", + "\n", + "## What you will learn \n", + "In this notebook, you will learn the basics of the LangChain platform as follows.\n", + "1. 
**LLM capabilities.** Explore LLM capabilities using LangChain\n", + "2. **Interacting with LLMs.** Use LangChain functions such as chains, prompt templates and more to connect to LLMs\n", + "3. **RAG.** Implementing a simple RAG in LangChain by connecting to external documents\n", + "4. **LangChain Expression Language (LCEL).** How to use LCEL instead of functions when interacting with LLMs\n", + "5. **LangChain Agents.** \n", + "\n", + "## Expected Broad Learning Outcomes\n", + "1. **Connecting to LLMs.** An understanding of how to connect to various open source and proprietary LLMs using Hugging Face and proprietary specific frameworks such as that for OpenAI and Mistral\n", + "2. **Different LLMs.** There are many varieties of LLMs: ```chat, instruct, question-answer, sentiment-analysis, instruct``` and more. Have a basic understanding of the differences across these models and when to use which one.\n", + "3. **The role of memory in Chat models.** Understand the importance of having memory in a chatbot and different strategies for doing it with LangChain.\n", + "4. **The process of implementing RAG in LangChain**. RAG is one of the most commonly used approaches for implementing chats as it enables connection to external custom data. Have a good understanding of the main steps involved in implementing a RAG-based system; the steps are the same in LangChain and other frameworks.\n", + "5. **Understand the role of vector databases.** Vector databases are an integral part of working with LLMs. Make sure you understand how they fit in the ecosystem and why they are important." + ] + }, + { + "cell_type": "markdown", + "id": "633b7017-2001-4cec-b34d-30a5bc4b92fc", + "metadata": {}, + "source": [ + "# Setup\n", + "\n", + "------" + ] + }, + { + "cell_type": "markdown", + "id": "7a131a8d-40d2-4bf3-9856-103ed70000d7", + "metadata": {}, + "source": [ + "## Import Packages\n", + "We will import packages as we go so that you appreciate which class we are using."
+ ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "b9a13ee9-f3d9-4141-a1a2-929cdc1b5113", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "from pathlib import Path" + ] + }, + { + "cell_type": "markdown", + "id": "1e1dad1b-4014-48e9-b911-2095c9864a84", + "metadata": {}, + "source": [ + "## Setup API Keys" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "b443a7a8-9dd7-4320-958e-cc3376e09cb4", + "metadata": {}, + "outputs": [], + "source": [ + "# ====================\n", + "# Setup API Keys\n", + "# ====================\n", + "# Although it's not recommended for security, you can also just \n", + "# paste your API keys " + ] + }, + { + "cell_type": "markdown", + "id": "61224418-685b-499c-96fe-568ba7993475", + "metadata": {}, + "source": [ + "## Setup input directories \n", + "Let's organize where our data is stored so that we can easily access it. Please refer to the slides for recommended folder setup. Copy and paste the full paths to your working folder in the variables below." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "a234a85f-b21b-4378-b3bd-f27feec67c36", + "metadata": {}, + "outputs": [], + "source": [ + "# Replace this folder with your working folder \n", + "DIR_WD = Path(\"/Users/dunstanmatekenya/Google Drive/My Drive/GenAI-Course/Mod2-LLM-Overview/\")\n", + "\n", + "# data folder\n", + "DIR_DATA = DIR_WD.joinpath(\"data\")\n", + "\n", + "# We can also set file names for data files we will use to save time\n", + "FILE_HEP_CHAD = DIR_DATA.joinpath(\"Hepatitis-Chad.pdf\")\n", + "\n", + "FILE_MIDDLE_EAST_COVID = DIR_DATA.joinpath(\"MidEast-COVID.pdf\")\n", + "\n", + "FILE_DENGUE = DIR_DATA.joinpath(\"Dengue-Global-situation.pdf\")" + ] + }, + { + "cell_type": "markdown", + "id": "2f148739-53c8-4cb8-b477-da5781e8195d", + "metadata": {}, + "source": [ + "# 1.
Exploring Language Tasks that LLMs can Perform\n", + "In this section, we will explore what type of NLP tasks LLMs can perform using the Hugging Face transformer package. In some cases, when we specify a specific model, the transformers package will take some time to download the model files. Also, the idea here is to show very simple capabilities. In a real world project, you can train and fine-tune the transformer models on your own dataset. For example, to do a fully fledged sentiment analysis with Hugging Face, take a look at [this tutorial](https://huggingface.co/blog/sentiment-analysis-python).\n", + "\n", + ">Note that for almost all of these tasks, you can replace the English text with French text and still get similar results" + ] + }, + { + "cell_type": "markdown", + "id": "2dc053ef-5ba8-4557-b0f2-0975447d566e", + "metadata": {}, + "source": [ + "## 1.1 Text and Document Classification\n", + "Text and document classification are closely related tasks. In **text classification**, we assign predefined categories to individual pieces of text, while **document classification** refers to the process of assigning predefined categories to longer pieces of text, such as entire documents, articles, or reports.\n", + "\n", + "- **Examples of text classification tasks**. Sentiment Analysis; Intent Detection;\n", + "- **Examples of document classification**. Topic categorization, " + ] + }, + { + "cell_type": "markdown", + "id": "860fcb52-c3f4-4c8f-8fb4-e2b4808114c2", + "metadata": {}, + "source": [ + "### Sentiment Analysis with the Hugging Face Transformers Library" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "499b7d29-d647-4cb9-ae00-fd2b88ae18b5", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/dunstanmatekenya/anaconda3/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets.
See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n", + "No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).\n", + "Using a pipeline without specifying a model name and revision in production is not recommended.\n", + "/Users/dunstanmatekenya/anaconda3/lib/python3.10/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n", + " warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "POSITIVE\n" + ] + } + ], + "source": [ + "# We use transformers ```pipeline library\n", + "from transformers import pipeline\n", + "\n", + "llm = pipeline(\"text-classification\")\n", + "text = \"I'm really enjoying my stay in Tunis\"\n", + "outputs = llm(text)\n", + "print(outputs[0]['label'])" + ] + }, + { + "cell_type": "markdown", + "id": "affcc68c-49ea-4cfe-bc59-294854156360", + "metadata": {}, + "source": [ + "## 1.2 Text Generation\n", + "Text generation is a process in natural language processing (NLP) where a machine learning model generates coherent and contextually relevant text based on a given input or prompt. This technology is used in various applications such as chatbots, automated content creation, machine translation, and more.\n", + "\n", + "In real life, the text is not always coherent, based on the model, when we use a default model, the results are not good. 
" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "0224c530-1943-481e-b4fc-92cfb2f62702", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "No model was supplied, defaulted to openai-community/gpt2 and revision 6c0e608 (https://huggingface.co/openai-community/gpt2).\n", + "Using a pipeline without specifying a model name and revision in production is not recommended.\n", + "Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n", + "Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Malawi is famous for urchin, the rice used in the national diet, and its high price and widespread lack of access to good sources of water are driving thousands of people into poverty. 
So far, the government has been able to bring in more than $2 billion through loans, while food aid has been limited in its expansion, for example by one million poor people trying to come back from war-ravaged country.\n", + "\n", + "As for the country's food safety, the government has been\n" + ] + } + ], + "source": [ + "llm = pipeline(\"text-generation\")\n", + "prompt = \"Malawi is famous for \"\n", + "outputs = llm(prompt, max_length=100)\n", + "print(outputs[0]['generated_text'])" + ] + }, + { + "cell_type": "markdown", + "id": "aabac32c-0547-44d4-b77b-34d16a2d8220", + "metadata": {}, + "source": [ + "**EXERCISE-0: Try to specify a different Hugging Face model and see if you get better results**" + ] + }, + { + "cell_type": "markdown", + "id": "b6bf9c35-35f0-4310-adff-6dbb7eab57f4", + "metadata": {}, + "source": [ + "## 1.3 Text Summarization\n", + "Text summarization is a natural language processing (NLP) task that involves creating a concise and coherent summary of a longer text document. The goal is to capture the most important information and main ideas while reducing the length of the original text. " + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "d0f4c46a-b896-47ce-928b-0c8b8bae063d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Walking amid Gion's Machiya wooden houses is a mesmerizing experience. The beautifullypreserved structures exuded an old-world charm that transports visitors back in time. The glow of lanterns lining the narrow streets add to theenchanting ambiance, making each stroll a\n" + ] + } + ], + "source": [ + "llm= pipeline(\"summarization\", model=\"facebook/bart-large-cnn\")\n", + "long_text = \"\"\"Walking amid Gion's Machiya wooden houses is a mesmerizing experience. The beautifully\n", + "preserved structures exuded an old-world charm that transports visitors back in time, making them feel\n", + "like they had stepped into a living museum. 
The glow of lanterns lining the narrow streets add to the\n", + "enchanting ambiance, making each stroll a memorable journey through Japan's rich cultural history.\n", + "\"\"\"\n", + "outputs = llm(long_text, max_length=60, clean_up_tokenization_spaces=True)\n", + "print(outputs[0]['summary_text'])" + ] + }, + { + "cell_type": "markdown", + "id": "58c10bc4-2237-45b2-a367-c7690a2586b4", + "metadata": {}, + "source": [ + "## 1.4 Question-Answering\n", + "Question Answering (QA) is one of the most common tasks or use cases for LLMs. In this task, the model is designed to automatically answer questions posed by humans in natural language. QA systems can be built to answer questions from a variety of sources, such as structured databases, knowledge bases, or unstructured text documents." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "a8aefbf4-a88a-4cdc-b882-eb0a5781200b", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "No model was supplied, defaulted to distilbert/distilbert-base-cased-distilled-squad and revision 626af31 (https://huggingface.co/distilbert/distilbert-base-cased-distilled-squad).\n", + "Using a pipeline without specifying a model name and revision in production is not recommended.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "wooden\n" + ] + } + ], + "source": [ + "llm = pipeline(\"question-answering\")\n", + "context = \"Walking amid Gion's Machiya wooden houses was a mesmerizing experience.\"\n", + "question = \"What are Machiya houses made of?\"\n", + "outputs = llm(question=question, context=context)\n", + "print(outputs['answer'])" + ] + }, + { + "cell_type": "markdown", + "id": "eea8c421-6a5d-426c-8f9f-fc94bc61b492", + "metadata": {}, + "source": [ + "## 1.5 Language Translation" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "cde7750a-4ab4-4a5d-9019-7dc7be065719", + "metadata": {}, + "outputs": [ + { + "name": "stderr", +
"output_type": "stream", + "text": [ + "No model was supplied, defaulted to google-t5/t5-base and revision 686f1db (https://huggingface.co/google-t5/t5-base).\n", + "Using a pipeline without specifying a model name and revision in production is not recommended.\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "C'est ma première visite en Tunisie.\n" + ] + } + ], + "source": [ + "llm = pipeline(\"translation_en_to_fr\")\n", + "text = \"This is my first time to visit Tunisia.\"\n", + "outputs = llm(text, clean_up_tokenization_spaces=True)\n", + "print(outputs[0]['translation_text'])" + ] + }, + { + "cell_type": "markdown", + "id": "d3e3d82d-f4ea-439b-9fd5-08ae2105f3a3", + "metadata": {}, + "source": [ + "# 2. Introducing LangChain Core Functionalities" + ] + }, + { + "cell_type": "markdown", + "id": "0d4c7bd8-c8cb-4e99-bef5-40144a82c78c", + "metadata": {}, + "source": [ + "It is always a good idea to read the documentation of a framework. Please head over to the [LangChain website](https://www.langchain.com) for details of core functionalities, use cases and features. The screenshot below provides a summary of the LangChain ecosystem of features and capabilities. The term **Chain** in LangChain refers to the core concept of **chains**, which are sequences of calls - whether to an LLM, a tool, or a data preprocessing step. The primary supported way to do this is with LCEL (we will see this later)."
+ ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "b02bc8a4-ea90-4622-92d8-43861fcb12d2", + "metadata": {}, + "outputs": [], + "source": [ + "from IPython.display import Image" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "2e39d82c-7089-4aa4-8054-d01427f1c1b3", + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "" + ] + }, + "execution_count": 11, + "metadata": { + "image/png": { + "width": 500 + } + }, + "output_type": "execute_result" + } + ], + "source": [ + "Image(filename='../images/LangChain-detailed.png', width=500) " + ] + }, + { + "cell_type": "markdown", + "id": "40dcce20-057f-4b5b-84b6-356a3c1db8a7", + "metadata": {}, + "source": [ + "## 2.1 Interacting with Models in LangChain \n", + "- General instruction models - Models which can answer questions but are not quite optimized for chat\n", + "- Chat models are more optimized for question and answering\n", + "- Prompting templates and techniques " + ] + }, + { + "cell_type": "markdown", + "id": "33a7bf7e-0ea4-431f-9f3e-23119e1a14a7", + "metadata": {}, + "source": [ + "### Trying out Open Vs. Proprietary Model\n", + "- **Accessing open source LLMs on Hugging Face.** In order to access open source LLMs from Hugging Face, you need two main inputs: ```Hugging Face token``` and the model id or url. Recall that you can explore and grab model details from the Hugging Face platform easily. Once you have that we can use ```HuggingFaceEndpoint``` or ```HuggingFaceHub``` to access and use the model.\n", + "\n", + "- **Accessing proprietary LLMs (e.g., OpenAI).** LangChain has specific packages for working with OpenAI models. For other providers such as Mistral, you need to check [LangChain documentation](https://python.langchain.com/v0.1/docs/integrations/chat/mistralai/)."
+ ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "0f911200-d6b2-47a2-b862-643f7f61bd83", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Failed to batch ingest runs: LangSmithError('Failed to POST https://api.smith.langchain.com/runs/batch in LangSmith API. HTTPError(\\'403 Client Error: Forbidden for url: https://api.smith.langchain.com/runs/batch\\', \\'{\"detail\":\"Forbidden\"}\\')')\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Can you still have fun in the rain?\n", + "Yes, you can still have fun in the rain! There are plenty of activities you can do indoors or outdoors, such as playing board games, reading a book, or going for a walk. You can also try to find creative ways to enjoy the rain, such as using a rain shower to take a bath or making a rain-soaked picnic. Just remember to stay safe and take precautions if necessary.\n" + ] + } + ], + "source": [ + "from langchain_community.llms import HuggingFaceEndpoint, HuggingFaceHub\n", + "\n", + "# Lets make this a global variable in case we want to use this model\n", + "# again\n", + "MODEL_ID_FALCON = 'tiiuae/falcon-7b-instruct'\n", + "\n", + "llm = HuggingFaceHub(repo_id=MODEL_ID_FALCON, \n", + " huggingfacehub_api_token=HUGGINGFACEHUB_API_TOKEN)\n", + "\n", + "question = 'Can you still have fun'\n", + "output = llm.invoke(question)\n", + "print(output)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "b35f1964-5938-4773-b768-334df1551939", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Failed to batch ingest runs: LangSmithError('Failed to POST https://api.smith.langchain.com/runs/batch in LangSmith API. 
HTTPError(\\'403 Client Error: Forbidden for url: https://api.smith.langchain.com/runs/batch\\', \\'{\"detail\":\"Forbidden\"}\\')')\n", + "Failed to batch ingest runs: LangSmithError('Failed to POST https://api.smith.langchain.com/runs/batch in LangSmith API. HTTPError(\\'403 Client Error: Forbidden for url: https://api.smith.langchain.com/runs/batch\\', \\'{\"detail\":\"Forbidden\"}\\')')\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " if you're dead inside?\n", + "\n", + "It is possible to have fun even if you feel dead inside. While feeling emotionally numb or disconnected can make it more challenging to enjoy activities or events, it is still possible to find moments of joy and pleasure.\n", + "\n", + "Here are some tips for having fun even if you feel dead inside:\n", + "\n", + "1. Engage in activities that have brought you joy in the past. Think back to activities or hobbies that you used to enjoy before you started feeling dead inside. Even if you don't feel the same level of excitement, engaging in these activities can still bring some enjoyment.\n", + "\n", + "2. Try something new. Sometimes, trying something new can help break out of a rut and bring some fun into your life. This could be a new hobby, sport, or even a new type of food.\n", + "\n", + "3. Spend time with loved ones. Being around people who care about you and make you feel loved and supported can help lift your mood and bring some fun into your life. Plan a fun outing or simply spend time talking and laughing with friends and family.\n", + "\n", + "4. Practice self-care. Taking care of yourself can help improve your overall mood and make it easier to have fun. Make time for activities that help you relax and recharge, such as taking a bath, reading a book,\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Failed to batch ingest runs: LangSmithError('Failed to POST https://api.smith.langchain.com/runs/batch in LangSmith API. 
HTTPError(\\'403 Client Error: Forbidden for url: https://api.smith.langchain.com/runs/batch\\', \\'{\"detail\":\"Forbidden\"}\\')')\n" + ] + } + ], + "source": [ + "from langchain_openai import OpenAI\n", + "\n", + "# Note that we will be able to select specific OpenAI models \n", + "# If you have a paid account \n", + "llm = OpenAI(openai_api_key=OPENAI_API_KEY)\n", + "\n", + "question = 'Can you still have fun'\n", + "output = llm.invoke(question)\n", + "print(output)" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "e9bda8e1-dce1-49c1-b308-82063fa53e6a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "3544" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "2*1772" + ] + }, + { + "cell_type": "markdown", + "id": "7381c4a5-5a33-405c-b82a-baacfebe6e56", + "metadata": {}, + "source": [ + "**EXERCISE-1. Find another model on Hugging Face to try**\n", + "- Go to [Hugging Face](https://huggingface.co/models)\n", + "- Search for **Text Generation** LLMs. Note that large models can be hard to run and can take a long time.\n", + "- Get the model Id\n", + "- Initialize the model, and ask it a question/prompt as we did with the Falcon model above" + ] + }, + { + "cell_type": "markdown", + "id": "f4edd5e5-44fa-49b0-b150-64a580da8f66", + "metadata": {}, + "source": [ + "### Prompt templates\n", + "Prompt templates are used for creating prompts in a more modular way, so they can be reused and built on.
Chains act as the glue in LangChain; bringing the other components together into workflows that pass inputs and outputs between the different components\n", + "- They are recipes for generating prompts\n", + "- Flexible and modular\n", + "- Can contain: instructions, examples, and additional context" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "0a2faa69-863e-4a4e-9118-ba50e0e72586", + "metadata": {}, + "outputs": [ + { + "ename": "ValidationError", + "evalue": "1 validation error for HuggingFaceHub\ntoken\n extra fields not permitted (type=value_error.extra)", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mValidationError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[21], line 8\u001b[0m\n\u001b[1;32m 5\u001b[0m template \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mYou are an artificial intelligence assistant, answer the question. 
\u001b[39m\u001b[38;5;132;01m{question}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 6\u001b[0m prompt \u001b[38;5;241m=\u001b[39m PromptTemplate(template\u001b[38;5;241m=\u001b[39mtemplate, input_variables\u001b[38;5;241m=\u001b[39m[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mquestion\u001b[39m\u001b[38;5;124m\"\u001b[39m])\n\u001b[0;32m----> 8\u001b[0m llm \u001b[38;5;241m=\u001b[39m \u001b[43mHuggingFaceHub\u001b[49m\u001b[43m(\u001b[49m\u001b[43mrepo_id\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mMODEL_ID_FALCON\u001b[49m\u001b[43m,\u001b[49m\u001b[43mtoken\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mHUGGINGFACEHUB_API_TOKEN\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 10\u001b[0m \u001b[38;5;66;03m# Create a Chain using the LLMChain() \u001b[39;00m\n\u001b[1;32m 11\u001b[0m llm_chain \u001b[38;5;241m=\u001b[39m LLMChain(prompt\u001b[38;5;241m=\u001b[39mprompt, llm\u001b[38;5;241m=\u001b[39mllm)\n", + "File \u001b[0;32m~/anaconda3/lib/python3.10/site-packages/pydantic/main.py:341\u001b[0m, in \u001b[0;36mpydantic.main.BaseModel.__init__\u001b[0;34m()\u001b[0m\n", + "\u001b[0;31mValidationError\u001b[0m: 1 validation error for HuggingFaceHub\ntoken\n extra fields not permitted (type=value_error.extra)" + ] + } + ], + "source": [ + "from langchain.prompts import PromptTemplate, ChatPromptTemplate\n", + "\n", + "# A String with instructions, same way we create prompts\n", + "# in GUI based interface such as chatGPT\n", + "template = \"You are an artificial intelligence assistant, answer the question. 
{question}\"\n", + "prompt = PromptTemplate(template=template, input_variables=[\"question\"])\n", + "\n", + "llm = HuggingFaceHub(repo_id=MODEL_ID_FALCON,token=HUGGINGFACEHUB_API_TOKEN)\n", + "\n", + "# Create a Chain using the LLMChain() \n", + "llm_chain = LLMChain(prompt=prompt, llm=llm)\n", + "question = \"What is LangChain?\"\n", + " \n", + "print(llm_chain.run(question))" + ] + }, + { + "cell_type": "markdown", + "id": "5a614f93-417e-4185-96c6-dc0b2ae1704d", + "metadata": {}, + "source": [ + "### Chat Models\n", + "Chat Models are a core component of LangChain. A chat model is a language model that uses chat messages as inputs and returns chat messages as outputs (as opposed to using plain text)" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "684669c2-a2f7-4801-b9db-87b702b545e5", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/dunstanmatekenya/anaconda3/lib/python3.10/site-packages/langchain_core/_api/deprecation.py:119: LangChainDeprecationWarning: The method `BaseChatModel.__call__` was deprecated in langchain-core 0.1.7 and will be removed in 0.3.0. Use invoke instead.\n", + " warn_deprecated(\n", + "Failed to batch ingest runs: LangSmithError('Failed to POST https://api.smith.langchain.com/runs/batch in LangSmith API. HTTPError(\\'403 Client Error: Forbidden for url: https://api.smith.langchain.com/runs/batch\\', \\'{\"detail\":\"Forbidden\"}\\')')\n" + ] + }, + { + "data": { + "text/plain": [ + "AIMessage(content='One of the best places to visit in Malawi is Lake Malawi. This stunning lake is known for its crystal-clear waters, beautiful beaches, and diverse marine life. Visitors can enjoy a variety of water activities such as snorkeling, diving, kayaking, and sailing. The lake is also surrounded by national parks and reserves, offering opportunities for wildlife viewing and hiking. 
Additionally, the lakeshore is dotted with charming villages where you can experience the local culture and hospitality. Overall, Lake Malawi is a must-visit destination for nature lovers and adventure seekers in Malawi.', response_metadata={'token_usage': {'completion_tokens': 116, 'prompt_tokens': 38, 'total_tokens': 154}, 'model_name': 'gpt-3.5-turbo', 'system_fingerprint': None, 'finish_reason': 'stop', 'logprobs': None}, id='run-11f38ec0-7a4b-47d9-9080-855d38cf0f35-0', usage_metadata={'input_tokens': 38, 'output_tokens': 116, 'total_tokens': 154})" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Failed to batch ingest runs: LangSmithError('Failed to POST https://api.smith.langchain.com/runs/batch in LangSmith API. HTTPError(\\'403 Client Error: Forbidden for url: https://api.smith.langchain.com/runs/batch\\', \\'{\"detail\":\"Forbidden\"}\\')')\n" + ] + } + ], + "source": [ + "from langchain_openai import ChatOpenAI\n", + "from langchain.prompts import PromptTemplate, ChatPromptTemplate\n", + "\n", + "llm = ChatOpenAI(temperature=0, openai_api_key=OPENAI_API_KEY)\n", + "\n", + "prompt_template = ChatPromptTemplate.from_messages([\n", + "(\"system\", \"You are a helpful assistant who knows alot about Africa.\"),\n", + "(\"human\",\"Respond to the question: {question}\")]\n", + ")\n", + "\n", + "full_prompt = prompt_template.format_messages(question='What is the best place to visit in Malawi?')\n", + "llm(full_prompt)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e11ea305-a3c9-439f-b135-236e26c39ac1", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "id": "c3e8c433-8672-4ae8-9579-66d5201bc657", + "metadata": {}, + "source": [ + "## 2.2. 
Managing chat model memory\n", + "- A key feature of chatbot applications is the ability to have a conversation, where context from the conversation is stored and available for the model to access for later questions or reference.\n", + "- Memory is important for conversations with chat models; it opens up the possibility of providing follow-up questions, of building and iterating on model responses, and for chatbots to adapt to the user's preferences and behaviors. \n", + "- Although LangChain allows us to customize and optimize in-conversation chatbot memory, it is still limited by the model's context window. \n", + "- An **LLM's context window** is the amount of input text the model can consider at once when generating a response, and the length of this window varies for different models. LangChain has a standard syntax for optimizing model memory. \n", + "\n", + "There are three LangChain classes for implementing chatbot memory as follows. \n", + "### The ```ChatMessageHistory``` Class\n", + "- The ChatMessageHistory class stores the full history of messages between the user and model. By providing this to the model, we can provide follow-up questions and iterate on the response message.\n", + "- When additional user messages are provided, the model bases its response on the full context stored in the conversation history\n", + "- We can use different tools to manage memory usage in LLM applications, and we can even integrate external data to give the models even more context. \n", + "\n", + "\n", + "### The ```ConversationBufferMemory``` class\n", + "- This gives the application a rolling buffer memory containing the last few messages in the conversation. Users can specify the number of messages to store with the size argument, and the application will discard older messages as newer ones are added. \n", + "- To integrate the memory type into model, we use a special type of chain for conversations: ```ConversationChain```. 
\n", + "\n", + "### The ```ConversationSummaryMemory``` class\n", + "- Summarizing important points from a conversation can also be a good way of optimizing memory. The ConversationSummaryMemory class summarizes the conversation over time, condensing the information. \n", + "- This means that the chat model can remember key pieces of context without needing to store and process the entire conversation history" + ] + }, + { + "cell_type": "markdown", + "id": "e6b7f3d4-afdd-422c-8688-7a31cb79bb26", + "metadata": {}, + "source": [ + "### Trying out the ChatMessageHistory class" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a98c8f82-8e9a-4603-811b-c20d034ee6b4", + "metadata": {}, + "outputs": [], + "source": [ + "chat = ChatOpenAI(temperature=0, openai_api_key=OPENAI_API_KEY)\n", + "history = ChatMessageHistory()\n", + "history.add_ai_message(\"Hi! Ask me anything please.\")\n", + "history.add_user_message(\"Describe a metaphor for learning LangChain in one sentence.\")\n", + "chat(history.messages)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1248fd89-60d3-4d49-a276-f419417f8e88", + "metadata": {}, + "outputs": [], + "source": [ + "# Ask a question based on the previous messages \n", + "history.add_user_message(\"Summarize the preceding sentence in fewer words\")\n", + "chat(history.messages)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5eeeaed5-cce7-4261-a852-eee1941c808b", + "metadata": {}, + "outputs": [], + "source": [ + "# Ask a question based on the previous messages \n", + "history.add_user_message(\"Summarize the preceding sentence in fewer words\")\n", + "chat(history.messages)" + ] + }, + { + "cell_type": "markdown", + "id": "6e1b5fb0-9fbf-4836-94d0-4017efbdfae0", + "metadata": {}, + "source": [ + "### Trying out the ConversationBufferMemory\n", + "For many applications, storing and accessing the entire conversation history isn't technically feasible. 
In these cases, the messages must be condensed while retaining as much relevant context as possible. One common way of doing this is with a memory buffer, which stores only the most recent messages based on the parameter ```size```." + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "c8742c04-33db-42fb-8509-987dee9e61d0", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\n", + "\u001b[1m> Entering new ConversationChain chain...\u001b[0m\n", + "Prompt after formatting:\n", + "\u001b[32;1m\u001b[1;3mThe following is a friendly conversation between a human and an AI. The AI is talkative and provides lots of specific details from its context. If the AI does not know the answer to a question, it truthfully says it does not know.\n", + "\n", + "Current conversation:\n", + "\n", + "Human: Describe a language model in one sentence\n", + "AI:\u001b[0m\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Failed to batch ingest runs: LangSmithError('Failed to POST https://api.smith.langchain.com/runs/batch in LangSmith API. HTTPError(\\'403 Client Error: Forbidden for url: https://api.smith.langchain.com/runs/batch\\', \\'{\"detail\":\"Forbidden\"}\\')')\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\u001b[1m> Finished chain.\u001b[0m\n", + "\n", + "\n", + "\u001b[1m> Entering new ConversationChain chain...\u001b[0m\n", + "Prompt after formatting:\n", + "\u001b[32;1m\u001b[1;3mThe following is a friendly conversation between a human and an AI. The AI is talkative and provides lots of specific details from its context. 
If the AI does not know the answer to a question, it truthfully says it does not know.\n", + "\n", + "Current conversation:\n", + "Human: Describe a language model in one sentence\n", + "AI: A language model is a statistical model that is trained on a large corpus of text and is able to generate coherent and grammatically correct sentences based on the patterns and structures it has learned.\n", + "Human: Describe it again using less words\n", + "AI:\u001b[0m\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Failed to batch ingest runs: LangSmithError('Failed to POST https://api.smith.langchain.com/runs/batch in LangSmith API. HTTPError(\\'403 Client Error: Forbidden for url: https://api.smith.langchain.com/runs/batch\\', \\'{\"detail\":\"Forbidden\"}\\')')\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\u001b[1m> Finished chain.\u001b[0m\n", + "\n", + "\n", + "\u001b[1m> Entering new ConversationChain chain...\u001b[0m\n", + "Prompt after formatting:\n", + "\u001b[32;1m\u001b[1;3mThe following is a friendly conversation between a human and an AI. The AI is talkative and provides lots of specific details from its context. 
If the AI does not know the answer to a question, it truthfully says it does not know.\n", + "\n", + "Current conversation:\n", + "Human: Describe a language model in one sentence\n", + "AI: A language model is a statistical model that is trained on a large corpus of text and is able to generate coherent and grammatically correct sentences based on the patterns and structures it has learned.\n", + "Human: Describe it again using less words\n", + "AI: A language model is a computer program that can generate sentences based on patterns it has learned from a large amount of text.\n", + "Human: Describe it again fewer words but at least one word\n", + "AI:\u001b[0m\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Failed to batch ingest runs: LangSmithError('Failed to POST https://api.smith.langchain.com/runs/batch in LangSmith API. HTTPError(\\'403 Client Error: Forbidden for url: https://api.smith.langchain.com/runs/batch\\', \\'{\"detail\":\"Forbidden\"}\\')')\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\u001b[1m> Finished chain.\u001b[0m\n", + "\n", + "\n", + "\u001b[1m> Entering new ConversationChain chain...\u001b[0m\n", + "Prompt after formatting:\n", + "\u001b[32;1m\u001b[1;3mThe following is a friendly conversation between a human and an AI. The AI is talkative and provides lots of specific details from its context. 
If the AI does not know the answer to a question, it truthfully says it does not know.\n", + "\n", + "Current conversation:\n", + "Human: Describe a language model in one sentence\n", + "AI: A language model is a statistical model that is trained on a large corpus of text and is able to generate coherent and grammatically correct sentences based on the patterns and structures it has learned.\n", + "Human: Describe it again using less words\n", + "AI: A language model is a computer program that can generate sentences based on patterns it has learned from a large amount of text.\n", + "Human: Describe it again fewer words but at least one word\n", + "AI: A language model is a program that generates sentences from text patterns.\n", + "Human: What did I first ask you? I forgot.\n", + "AI:\u001b[0m\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Failed to batch ingest runs: LangSmithError('Failed to POST https://api.smith.langchain.com/runs/batch in LangSmith API. HTTPError(\\'403 Client Error: Forbidden for url: https://api.smith.langchain.com/runs/batch\\', \\'{\"detail\":\"Forbidden\"}\\')')\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "\u001b[1m> Finished chain.\u001b[0m\n" + ] + }, + { + "data": { + "text/plain": [ + "' You asked me to describe a language model in one sentence.'" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Failed to batch ingest runs: LangSmithError('Failed to POST https://api.smith.langchain.com/runs/batch in LangSmith API. 
HTTPError(\\'403 Client Error: Forbidden for url: https://api.smith.langchain.com/runs/batch\\', \\'{\"detail\":\"Forbidden\"}\\')')\n" + ] + } + ], + "source": [ + "from langchain.memory import ChatMessageHistory, ConversationBufferMemory, ConversationSummaryMemory\n", + "from langchain.chains import LLMChain, ConversationChain, RetrievalQA, RetrievalQAWithSourcesChain\n", + "# Create an OpenAI model\n", + "chat = OpenAI(temperature=0, openai_api_key=OPENAI_API_KEY)\n", + "\n", + "# Create the memory object with size set to 4\n", + "memory = ConversationBufferMemory(size=4)\n", + "buffer_chain = ConversationChain(llm=chat, memory=memory, verbose=True)\n", + "\n", + "# Ask a series of follow-up questions that rely on the stored conversation\n", + "buffer_chain.predict(input=\"Describe a language model in one sentence\")\n", + "buffer_chain.predict(input=\"Describe it again using less words\")\n", + "buffer_chain.predict(input=\"Describe it again fewer words but at least one word\")\n", + "buffer_chain.predict(input=\"What did I first ask you? I forgot.\")" + ] + }, + { + "cell_type": "markdown", + "id": "efd52d16-a392-4aad-83e7-2e044b6d0c43", + "metadata": {}, + "source": [ + "**EXERCISE-2. For the ```ConversationBufferMemory```, change the buffer size to 1 or 2 and see what happens**" + ] + }, + { + "cell_type": "markdown", + "id": "8e535976-cabe-4162-b8ca-e84532dc783c", + "metadata": {}, + "source": [ + "## ConversationSummaryMemory\n", + "For longer conversations, storing the entire memory, or even a long buffer memory, may not be technically feasible. In these cases, a summary memory implementation can be a good option. Summary memories summarize the conversation at each step to retain the key context for the model to use. This works by using another LLM for generating the summaries, alongside the LLM used for generating the responses."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e7325752-2d3d-499e-b7ae-31122b1f64d5", + "metadata": {}, + "outputs": [], + "source": [ + "# ==============================================\n", + "# PLEASE FOLLOW INSTRUCTIONS AND COMPLETE CODE\n", + "# ==============================================\n", + "\n", + "# Use openAI model from earlier as a summary model\n", + "summary_llm = YOUR CODE HERE\n", + "\n", + "# Complete code below by putting in summary model above\n", + "memory = ConversationSummaryMemory(llm=summary_llm)\n", + "\n", + "# Create a chat model to use in the Conversation chain below (refer\n", + "# previous cells where we created OpenAI chat model\n", + "chat_model = YOUR CODE HERE\n", + "\n", + "# Create a conversation chain as we did before \n", + "summary_chain = YOUR CODE HERE\n", + "\n", + "summary_chain.predict(input=\"Please tell me about Malawi.\")\n", + "summary_chain.predict(input=\"Does that affect Malawi's income?\")" + ] + }, + { + "cell_type": "markdown", + "id": "f4c250ad-6f8a-42a6-99d5-bdb7b75df1e1", + "metadata": {}, + "source": [ + "# 3. Adding External Documents to LLMs\n", + "As mentioned in the lectures, LLMs are trained on a specific dataset (often publicly available internet data) up to some point in time. Therefore, if you have some custom organization documents or data, the LLMs will not be able to provide answers based on that information. Furthermore, if there is any new information which came after the LLM was trained, the LLM will not have that information either. \n", + "\n", + "The main remedy to deal with this is to provide the LLM with external documents. 
Adding external documents further helps reduce **hallucinations** as the LLM has little opportunity to make up stuff (hallucinate) when it has access to this extra knowledge.\n", + "\n", + "In LangChain, there are four main steps to provide external documents to the LLM (essentially creating a Retrieval Augmented Generation, or **RAG**, chatbot):\n", + "1. Identify the data sources (documents, datasets, websites, databases etc).\n", + "\n", + "2. Load the documents into LangChain using document loaders. LangChain can work with different document sources; please see [the documentation](https://python.langchain.com/v0.1/docs/integrations/document_loaders/). \n", + "\n", + "3. Split the documents into chunks. \n", + "\n", + "4. Create vector embeddings and store them in a vector database for retrieval" + ] + }, + { + "cell_type": "markdown", + "id": "8eb4a208-8c25-4060-bf4b-c4eb06e26557", + "metadata": {}, + "source": [ + "### 3.1 Document Loaders\n", + "LangChain has more than 160 document loaders. Some loaders are provided by 3rd parties who manage unique document formats. These include Amazon S3, Microsoft, Google Cloud, Jupyter notebooks, pandas DataFrames, unstructured HTML, YouTube audio transcripts, and more. " + ] + }, + { + "cell_type": "markdown", + "id": "9efebd7e-59c0-4aef-81a3-5d9a500d1319", + "metadata": {}, + "source": [ + "#### PDF Document Loader\n", + "- Requires installation of the ```pypdf``` package as a dependency.\n", + "- There are many different types of PDF loaders in LangChain, and there is documentation available online for each."
+ ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "fc21c844-25b1-4447-a7ea-4aff7ad450ed", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Requirement already satisfied: pypdf in /Users/dunstanmatekenya/anaconda3/lib/python3.10/site-packages (3.8.1)\n" + ] + } + ], + "source": [ + "!pip install pypdf" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "417c534c-4a01-49a4-9c8f-16450dec011a", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.document_loaders import PyPDFLoader\n", + "loader = PyPDFLoader(str(FILE_DENGUE))\n", + "data = loader.load()\n", + "print(data[0])" + ] + }, + { + "cell_type": "markdown", + "id": "bdb6fbaf-34c0-4fc6-8570-76aa853e78a5", + "metadata": {}, + "source": [ + "**EXERCISE-3. Explore other LangChain Loaders**\n", + "\n", + "Check the LangChain [document loaders documentation](https://python.langchain.com/v0.1/docs/integrations/document_loaders/) \n", + "and also check [here](https://python.langchain.com/v0.1/docs/modules/data_connection/) for the most commonly used loaders.\n", + "1. Identify 5 document loaders you find interesting. What are third-party document loaders?\n", + "2. **HTML loaders**. Explore the html or webpage loaders. \n", + "3. Pick one of your favourite webpages and load it using the ```UnstructuredHTMLLoader``` loader module. Refer to the [documentation](UnstructuredHTMLLoader) on how to import the module.\n", + "4. How do you think this changes your approach to ```web-scraping```? Do you think web scraping will change or not with this new capability to just connect to a website and query it?"
+ ] + }, + { + "cell_type": "markdown", + "id": "07bde0cf-56cb-4726-a3f0-cfc839ba1d3e", + "metadata": {}, + "source": [ + "### 3.2 Preparing documents for vector database and retrieval\n", + "In this stage, there are three sub-steps:\n", + "- The document is split to enhance efficiency in storage, indexing and ultimately efficient retrieval. Furthermore, chunking also helps with ensuring the document (which acts as context) can fit in the context window \n", + "- An embedding model is used to convert the documents into ```vector embeddings```\n", + "- The vectorized data is stored into a vector database." + ] + }, + { + "cell_type": "markdown", + "id": "ba116fde-3896-40d2-b921-18c2de13b56d", + "metadata": {}, + "source": [ + "#### Splitting/Chunking Documents\n", + "- Given a PDF document, one naive splitting option would be to separate the document into lines as they appear in the document. This would be simple to implement but could be problematic. Key context required for understanding one line is often found in a different line, and these lines would be processed separately, so we need another strategy which can maintain context across pieces of text in the document — enter the **overlap concept**.\n", + "We will compare two document splitting methods from LangChain. \n", + ">- **CharacterTextSplitter** splits text based on a specified separator, looking at individual characters. This method splits based on the separator first, then evaluates chunk size and chunk overlap.\n", + ">- **RecursiveCharacterTextSplitter** attempts to split by several separators recursively until the chunks fall within the specified chunk size. There are many other methods that use natural language processing to infer meaning and split appropriately. Optimizing this is an active area of research.\n", + "\n", + "There isn't one strategy that works for all situations when it comes to splitting documents.
\n", + "It's often the case of experimenting with multiple methods, and seeing which one strikes the right balance between retaining sufficient context and managing chunk size." + ] + }, + { + "cell_type": "markdown", + "id": "633a6334-4e51-4e28-b2a9-1967fd36d6b7", + "metadata": {}, + "source": [ + "##### CharacterTextSplitter" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "9cabe5b1-557f-4480-83ef-9637682546c5", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Created a chunk of size 52, which is longer than the specified 24\n" + ] + } + ], + "source": [ + "from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter\n", + "quote = 'One machine can do the work of fifty ordinary humans.\\\n", + "No machine can do the work of one extraordinary human.'\n", + "\n", + "chunk_size = 24\n", + "chunk_overlap = 3\n", + "\n", + "ct_splitter = CharacterTextSplitter(separator=\".\", \n", + " chunk_overlap=chunk_overlap, chunk_size=chunk_size)\n", + "\n", + "docs = ct_splitter.split_text(quote)" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "id": "2a6a8893-4f70-40db-a1a2-66055f532d3d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['One machine can do the work of fifty ordinary humans',\n", + " 'No machine can do the work of one extraordinary human']" + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "docs" + ] + }, + { + "cell_type": "markdown", + "id": "08337229-f503-429b-8345-e5d987f0d774", + "metadata": {}, + "source": [ + "##### RecursiveCharacterTextSplitter" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "id": "431f2615-19d8-45a7-b626-f78e45332534", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['One machine can do the', 'work of fifty ordinary', 'humans.No machine can', 'do the work of one', 
'extraordinary human.']\n" + ] + } + ], + "source": [ + "# Using the same variables: chunk_size and chunk_overlap, instantiate RecursiveCharacterTextSplitter\n", + "rc_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap = chunk_overlap)\n", + "docs = rc_splitter.split_text(quote)\n", + "print(docs)" + ] + }, + { + "cell_type": "markdown", + "id": "8777067d-35d7-471d-983f-fec0a6aafbc0", + "metadata": {}, + "source": [ + "#### Load data into a vector database\n", + "At this stage, you will be faced with a decision to choose which vector database to use. \n", + "For our simple demonstration purpose, we will use [chromadb](https://www.trychroma.com), an open source vector database solution. The type of vector database solution you choose can depend on numerous factors such as:\n", + "- How large are the documents you will be processing\n", + "- How much money you have to spend on the project\n", + "- Efficiency/latency requirements for your use case; if you need to provide a solution in real time, you may need a different solution\n", + "- Accuracy requirements. Sometimes there is a tradeoff between accuracy and latency.\n", + "- Integration requirements with existing platforms. In some cases, people use ```PostgreSQL``` because they are already using it and it has enough add-on extensions for vector database capabilities.\n", + "\n", + "Another decision is the choice of **embedding model** — the LLM which converts the text/documents into vectors. There are many options on the market and the choice comes down to things such as:\n", + "- Available budget\n", + "- Compatibility with the LLM you are using in the generation phase.
People do use a different embedding model from the generation model\n", + "> embedding_llm = Mistral, \n", + "> chat_model = ChatOpenAI\n", + "- Nature of documents, size and alot of other factors" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "id": "79cbfc3e-d388-4410-8dff-56a46a47c53d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Collecting sentence_transformers\n", + " Downloading sentence_transformers-3.0.0-py3-none-any.whl (224 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m224.7/224.7 kB\u001b[0m \u001b[31m827.6 kB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: huggingface-hub>=0.15.1 in /Users/dunstanmatekenya/anaconda3/lib/python3.10/site-packages (from sentence_transformers) (0.23.2)\n", + "Requirement already satisfied: scipy in /Users/dunstanmatekenya/anaconda3/lib/python3.10/site-packages (from sentence_transformers) (1.10.1)\n", + "Requirement already satisfied: scikit-learn in /Users/dunstanmatekenya/anaconda3/lib/python3.10/site-packages (from sentence_transformers) (1.2.2)\n", + "Requirement already satisfied: torch>=1.11.0 in /Users/dunstanmatekenya/anaconda3/lib/python3.10/site-packages (from sentence_transformers) (2.1.2)\n", + "Requirement already satisfied: tqdm in /Users/dunstanmatekenya/anaconda3/lib/python3.10/site-packages (from sentence_transformers) (4.65.0)\n", + "Requirement already satisfied: Pillow in /Users/dunstanmatekenya/anaconda3/lib/python3.10/site-packages (from sentence_transformers) (9.4.0)\n", + "Requirement already satisfied: transformers<5.0.0,>=4.34.0 in /Users/dunstanmatekenya/anaconda3/lib/python3.10/site-packages (from sentence_transformers) (4.41.2)\n", + "Requirement already satisfied: numpy in /Users/dunstanmatekenya/anaconda3/lib/python3.10/site-packages (from sentence_transformers) (1.23.5)\n", + "Requirement already satisfied: 
fsspec>=2023.5.0 in /Users/dunstanmatekenya/anaconda3/lib/python3.10/site-packages (from huggingface-hub>=0.15.1->sentence_transformers) (2023.12.2)\n", + "Requirement already satisfied: typing-extensions>=3.7.4.3 in /Users/dunstanmatekenya/anaconda3/lib/python3.10/site-packages (from huggingface-hub>=0.15.1->sentence_transformers) (4.8.0)\n", + "Requirement already satisfied: packaging>=20.9 in /Users/dunstanmatekenya/anaconda3/lib/python3.10/site-packages (from huggingface-hub>=0.15.1->sentence_transformers) (23.2)\n", + "Requirement already satisfied: pyyaml>=5.1 in /Users/dunstanmatekenya/anaconda3/lib/python3.10/site-packages (from huggingface-hub>=0.15.1->sentence_transformers) (6.0.1)\n", + "Requirement already satisfied: filelock in /Users/dunstanmatekenya/anaconda3/lib/python3.10/site-packages (from huggingface-hub>=0.15.1->sentence_transformers) (3.9.0)\n", + "Requirement already satisfied: requests in /Users/dunstanmatekenya/anaconda3/lib/python3.10/site-packages (from huggingface-hub>=0.15.1->sentence_transformers) (2.28.1)\n", + "Requirement already satisfied: networkx in /Users/dunstanmatekenya/anaconda3/lib/python3.10/site-packages (from torch>=1.11.0->sentence_transformers) (2.8.4)\n", + "Requirement already satisfied: sympy in /Users/dunstanmatekenya/anaconda3/lib/python3.10/site-packages (from torch>=1.11.0->sentence_transformers) (1.12)\n", + "Requirement already satisfied: jinja2 in /Users/dunstanmatekenya/anaconda3/lib/python3.10/site-packages (from torch>=1.11.0->sentence_transformers) (3.1.2)\n", + "Requirement already satisfied: regex!=2019.12.17 in /Users/dunstanmatekenya/anaconda3/lib/python3.10/site-packages (from transformers<5.0.0,>=4.34.0->sentence_transformers) (2022.7.9)\n", + "Requirement already satisfied: tokenizers<0.20,>=0.19 in /Users/dunstanmatekenya/anaconda3/lib/python3.10/site-packages (from transformers<5.0.0,>=4.34.0->sentence_transformers) (0.19.1)\n", + "Requirement already satisfied: safetensors>=0.4.1 in 
/Users/dunstanmatekenya/anaconda3/lib/python3.10/site-packages (from transformers<5.0.0,>=4.34.0->sentence_transformers) (0.4.3)\n", + "Requirement already satisfied: threadpoolctl>=2.0.0 in /Users/dunstanmatekenya/anaconda3/lib/python3.10/site-packages (from scikit-learn->sentence_transformers) (2.2.0)\n", + "Requirement already satisfied: joblib>=1.1.1 in /Users/dunstanmatekenya/anaconda3/lib/python3.10/site-packages (from scikit-learn->sentence_transformers) (1.1.1)\n", + "Requirement already satisfied: MarkupSafe>=2.0 in /Users/dunstanmatekenya/anaconda3/lib/python3.10/site-packages (from jinja2->torch>=1.11.0->sentence_transformers) (2.1.1)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /Users/dunstanmatekenya/anaconda3/lib/python3.10/site-packages (from requests->huggingface-hub>=0.15.1->sentence_transformers) (2024.2.2)\n", + "Requirement already satisfied: charset-normalizer<3,>=2 in /Users/dunstanmatekenya/anaconda3/lib/python3.10/site-packages (from requests->huggingface-hub>=0.15.1->sentence_transformers) (2.1.1)\n", + "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /Users/dunstanmatekenya/anaconda3/lib/python3.10/site-packages (from requests->huggingface-hub>=0.15.1->sentence_transformers) (1.26.15)\n", + "Requirement already satisfied: idna<4,>=2.5 in /Users/dunstanmatekenya/anaconda3/lib/python3.10/site-packages (from requests->huggingface-hub>=0.15.1->sentence_transformers) (3.4)\n", + "Requirement already satisfied: mpmath>=0.19 in /Users/dunstanmatekenya/anaconda3/lib/python3.10/site-packages (from sympy->torch>=1.11.0->sentence_transformers) (1.3.0)\n", + "Installing collected packages: sentence_transformers\n", + "Successfully installed sentence_transformers-3.0.0\n" + ] + } + ], + "source": [ + "!pip install sentence_transformers" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1ad4c76d-7248-49d6-acf2-3c1193bd2dcb", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain_openai import 
OpenAIEmbeddings\n", + "from langchain_community.vectorstores import Chroma\n", + "from langchain_community.embeddings import HuggingFaceEmbeddings\n", + "\n", + "\n", + "# Let's load the hepatitis E (Chad) document and then store it in a database\n", + "loader = PyPDFLoader(str(FILE_HEP_CHAD))\n", + "data = loader.load()\n", + "\n", + "chunk_size = 100\n", + "chunk_overlap = 10\n", + "\n", + "# Split with RecursiveCharacterTextSplitter\n", + "rc_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap = chunk_overlap)\n", + "docs = rc_splitter.split_documents(data)\n", + "\n", + "# Embedding model: we use an open-source Hugging Face model here (the OpenAI alternative is commented out)\n", + "#embedding_model = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)\n", + "embedding_model = HuggingFaceEmbeddings(model_name=\"all-MiniLM-L6-v2\")\n", + "# Directory to store our database-set this to the data directory\n", + "vectordb = Chroma(persist_directory=str(DIR_DATA), embedding_function=embedding_model)\n", + "\n", + "# Store the database\n", + "vectordb.persist()\n", + "\n", + "# Create the database\n", + "docstorage = Chroma.from_documents(docs, embedding_model)" + ] + }, + { + "cell_type": "markdown", + "id": "e30d865a-42fb-4325-b53c-84da656a0703", + "metadata": {}, + "source": [ + "**EXERCISE-4. Explore what functionality is available under the database object ```docstorage```**\n", + "- You can use ```dir(object)``` to check available attributes and functions\n", + "- Note that there are many search-related functions which enable you to control how user queries are searched when building Chatbots" + ] + }, + { + "cell_type": "markdown", + "id": "52ea226f-8de3-4c5c-a69e-9d8971b047cf", + "metadata": {}, + "source": [ + "### 3.3 Retrieval\n", + "Now that we have added our external file, let's use the added document as context in our LLM chains and ask questions again."
+ ] + }, + { + "cell_type": "code", + "execution_count": 40, + "id": "5af5276c-4235-44c8-926f-652acf3d16dd", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Failed to batch ingest runs: LangSmithError('Failed to POST https://api.smith.langchain.com/runs/batch in LangSmith API. HTTPError(\\'403 Client Error: Forbidden for url: https://api.smith.langchain.com/runs/batch\\', \\'{\"detail\":\"Forbidden\"}\\')')\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "============================================================\n", + "LLM Output without using RAG-external document from WHO website\n", + "============================================================\n", + "\n", + "\n", + "As of September 2021, there are several ongoing disease outbreaks in Chad. These include:\n", + "\n", + "1. COVID-19: Chad has been experiencing a surge in COVID-19 cases since April 2021, with a peak in July. As of September 2021, there have been over 5,000 confirmed cases and over 170 deaths.\n", + "\n", + "2. Cholera: A cholera outbreak was declared in June 2021 in the Lake Chad region, affecting areas near the border with Nigeria. As of September 2021, there have been over 2,000 suspected cases and 50 deaths.\n", + "\n", + "3. Measles: Chad has been experiencing a measles outbreak since January 2020. As of September 2021, there have been over 20,000 suspected cases and over 300 deaths, mainly affecting children under the age of 5.\n", + "\n", + "4. Yellow fever: A yellow fever outbreak was declared in November 2020, affecting several regions in Chad. As of September 2021, there have been over 60 confirmed cases and 10 deaths.\n", + "\n", + "5. 
Meningitis: Chad is currently experiencing a meningitis outbreak, with over 2,000 suspected cases and 200 deaths reported since the beginning of 2021.\n", + "\n", + "\n", + "\n", + "============================================================\n", + "LLM Output with RAG-external document from WHO website\n", + "============================================================\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Failed to batch ingest runs: LangSmithError('Failed to POST https://api.smith.langchain.com/runs/batch in LangSmith API. HTTPError(\\'403 Client Error: Forbidden for url: https://api.smith.langchain.com/runs/batch\\', \\'{\"detail\":\"Forbidden\"}\\')')\n", + "Failed to batch ingest runs: LangSmithError('Failed to POST https://api.smith.langchain.com/runs/batch in LangSmith API. HTTPError(\\'403 Client Error: Forbidden for url: https://api.smith.langchain.com/runs/batch\\', \\'{\"detail\":\"Forbidden\"}\\')')\n", + "Failed to batch ingest runs: LangSmithError('Failed to POST https://api.smith.langchain.com/runs/batch in LangSmith API. HTTPError(\\'403 Client Error: Forbidden for url: https://api.smith.langchain.com/runs/batch\\', \\'{\"detail\":\"Forbidden\"}\\')')\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Yes, there is currently a hepatitis E outbreak in Chad, specifically in the eastern Ouaddai province. This outbreak was last reported on May 8, 2024.\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Failed to batch ingest runs: LangSmithError('Failed to POST https://api.smith.langchain.com/runs/batch in LangSmith API. 
HTTPError(\\'403 Client Error: Forbidden for url: https://api.smith.langchain.com/runs/batch\\', \\'{\"detail\":\"Forbidden\"}\\')')\n" + ] + } + ], + "source": [ + "from langchain.chains import RetrievalQA\n", + "\n", + "# Create LLM as before \n", + "llm = OpenAI(openai_api_key=OPENAI_API_KEY)\n", + "\n", + "# Create the retrieval QA chain, using the vector database as the retriever\n", + "qa = RetrievalQA.from_chain_type(llm=llm, chain_type=\"stuff\", retriever=docstorage.as_retriever())\n", + "\n", + "# The question we will ask the LLM\n", + "# You can ask these questions in French and LLM will also answer in French\n", + "question = \"Are there any disease outbreaks in Chad?\"\n", + "\n", + "# Answer without RAG\n", + "output = llm.invoke(question)\n", + "print()\n", + "print(\"=\"*60)\n", + "print(\"LLM Output without using RAG-external document from WHO website\")\n", + "print(\"=\"*60)\n", + "print(output)\n", + "\n", + "# For the RAG chain, we pass the same question string to run()\n", + "print()\n", + "print(\"=\"*60)\n", + "print(\"LLM Output with RAG-external document from WHO website\")\n", + "print(\"=\"*60)\n", + "print(qa.run(question))" + ] + }, + { + "cell_type": "markdown", + "id": "3c735436-9e4a-410d-8f37-4f290cf51e1b", + "metadata": {}, + "source": [ + "**EXERCISE-5. Implement a simple RAG as we did above**\n", + "1. Use the ```FILE_MIDDLE_EAST_COVID``` file to create a new Chroma database\n", + "2. Implement a RAG chain as we did above.\n", + "3. Compare answers between the LLM with RAG and the LLM without RAG\n", + "\n", + "**Hint.** Copy and paste the code from above and edit it." + ] + }, + { + "cell_type": "markdown", + "id": "6375cfcc-c6a9-406a-8e72-cd0a04a9b2ac", + "metadata": {}, + "source": [ + "### 3.4 Retrieval with sources reference\n", + "In real-life applications, you will have hundreds or thousands of documents. A user of your system may need to know the source of the answers they are getting. Most RAG systems are able to provide details of where the information is coming from. 
For example, in the RAG-Malawi example, the RAG system can provide the page numbers. In this case, with LangChain, you can just provide information about the document where the answer came from.\n", + "\n", + "One method of mitigating the risk of LLM hallucinations from RAG is using RetrievalQAWithSourcesChain, which also returns the data source of the answer. Aside from the chain class, the code is exactly the same as RetrievalQA. However, this class returns a dictionary containing a 'sources' key and an 'answer' key. The 'sources' key refers to the file where the answer came from, which is helpful when there are many documents in the database." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "399ddc48-1e99-43a4-ae28-298d5427b367", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.chains import RetrievalQAWithSourcesChain\n", + "\n", + "qa = RetrievalQAWithSourcesChain.from_chain_type(llm=llm, chain_type=\"stuff\", retriever=docstorage.as_retriever())\n", + "\n", + "results = qa({\"question\": \"Are there any disease outbreaks in Chad?\"},\n", + " return_only_outputs=True)\n", + "print(results)" + ] + }, + { + "cell_type": "markdown", + "id": "d7b0f265-8583-4357-9040-8dd75429179c", + "metadata": {}, + "source": [ + "# 4. LangChain Expression Language (LCEL)\n", + "> In summary, LCEL is a different (recommended) syntax for achieving the same things we have done in LangChain\n", + "\n", + "LCEL is a key part of the LangChain toolkit. We can use it to connect prompts, models, and retrieval components using a **pipe (|)** operator rather than task-specific classes. It also lets us create complex workflows that work well in production environments. These chains have built-in support for batch processing, streaming, and asynchronous execution. 
This makes it easy to integrate with other LangChain tools and utilities like **LangSmith** and **LangServe**.\n", + "\n", + "A few notes about the chain with LCEL\n", + "- The ```| (pipe)``` in LCEL indicates that the output from one component will be used as the input to the next." + ] + }, + { + "cell_type": "markdown", + "id": "8229521e-790e-41b8-9fd8-159a54cae8c7", + "metadata": {}, + "source": [ + "## 4.1 A Simple Chain with LCEL" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "88efb378-36b2-4499-a033-c3145101475e", + "metadata": {}, + "outputs": [], + "source": [ + "model = ChatOpenAI(openai_api_key=OPENAI_API_KEY)\n", + "prompt = ChatPromptTemplate.from_template(\"You are a helpful personal assistant. \\\n", + "Answer the following question: {question}\")\n", + "\n", + "# Create Chain in LCEL fashion\n", + "llm_chain = prompt | model\n", + "\n", + "# Recall how we created a chain before \n", + "#llm_chain = LLMChain(prompt=prompt, llm=llm)\n", + "\n", + "\n", + "# Run using invoke\n", + "print(llm_chain.invoke(\"What is the capital of Tunisia?\"))" + ] + }, + { + "cell_type": "markdown", + "id": "1ac8949f-0ad0-4d22-8dbf-e8fd992f4065", + "metadata": {}, + "source": [ + "## 4.2 RAG with LCEL" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "68b9b2eb-db18-49e2-b75e-2c27fb862f2e", + "metadata": {}, + "outputs": [], + "source": [ + "model = ChatOpenAI(openai_api_key = OPENAI_API_KEY)\n", + "\n", + "embedding_model = OpenAIEmbeddings(openai_api_key = OPENAI_API_KEY)\n", + "vectorstore = Chroma.from_texts([\"Dunstan stayed in Tunis, the capital of Tunisia from Sunday May 26 to Satarday May 31.\"],embedding=embedding_model)\n", + "retriever = vectorstore.as_retriever()\n", + "\n", + "template = \"\"\"Answer the question based on the context:{context}. 
Question: {question}\"\"\"\n", + "prompt = ChatPromptTemplate.from_template(template)\n", + "\n", + "chain = ({\"context\": retriever,\"question\": RunnablePassthrough()} | prompt | model | StrOutputParser())\n", + "chain.invoke(\"When did Dunstan visit Tunisia?\")" + ] + }, + { + "cell_type": "markdown", + "id": "c6ac7202-4bb2-446d-85bf-e55bed0f53b1", + "metadata": {}, + "source": [ + "## 4.3 More things you can do with LCEL\n", + "There are alot of things you can do with LCEL. For example,\n", + "- **Batch or Streaming**. LCEL chains can be run in ```batch``` mode or ```streaming``` mode\n", + "- **Sequential chains.**. Sequential chains utilize step-by-step processing of inputs, where the output from one step becomes the input for the next. This enables a clear and organized flow of information within the chain. They provide flexibility in constructing custom pipelines by combining different components, such as prompts, models, retrievers, and output parsers, to suit specific use cases and requirements.\n", + "- **Passing Data Across Chains.** There are many cases where your application will require the use of several chains that pass outputs between them" + ] + }, + { + "cell_type": "markdown", + "id": "826bc2fc-cb3b-4d23-abab-52451461e0c4", + "metadata": {}, + "source": [ + "### Using sequential chaining to create Python code and check it with LCEL" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8bc0c20c-167c-454c-a87a-db01aa8f2855", + "metadata": {}, + "outputs": [], + "source": [ + "coding_prompt = PromptTemplate.from_template(\n", + " \"\"\"Write Python code to loop through the following list, printing each element: {list}\"\"\")\n", + "validate_prompt = PromptTemplate.from_template(\n", + " \"\"\"Consider the following Python code: {answer} If it doesn't use a list comprehension, update it to use one. 
If it does use a list comprehension, return the original code without explanation:\"\"\")\n", + "\n", + "llm = ChatOpenAI(openai_api_key=OPENAI_API_KEY)\n", + "\n", + "# Create the sequential chain\n", + "chain = ({\"answer\": coding_prompt | llm | StrOutputParser()}\n", + " | validate_prompt\n", + " | llm \n", + " | StrOutputParser() )\n", + "\n", + "# Invoke the chain with the user's question\n", + "print(chain.invoke({\"list\": \"[3, 1, 4, 1]\"}))" + ] + }, + { + "cell_type": "markdown", + "id": "2ae9fb57-5565-4bdf-a977-67f735260e51", + "metadata": {}, + "source": [ + "# 5. LangChain Agents\n", + "In LLMs and Gen AI, the idea behind agents is to use language models to determine which sequence of actions to take to meet a pre-defined objective. Thus, the LLM is able to solve complex problems or perform complex tasks by planning, determining what tools to use and what knowledge to get until the task is solved without explicit supervision.\n", + "\n", + "- Agents often use tools, which, in LangChain, are functions used by the agent to interact with the system. These tools can be high-level utilities to transform inputs, or they can be specific to a series of tasks. Agents can even use chains and other agents as tools!\n", + "- In LangChain, there are different agent types. See [this documentation](https://python.langchain.com/v0.1/docs/modules/agents/agent_types/) for an explanation of how the agents are categorized. \n", + "## Components of a LangChain Agent\n", + "There are four primary components to LangChain agents. \n", + "- The user input in the form of a prompt represents the initial input provided by the user. \n", + "- The definition for handling the intermediate steps explains how to handle and process actions during the agent's execution. \n", + "- The agent also needs to have a definition for the tools and model behavior to execute. \n", + "- The output parser formats the output generated by the model into the most appropriate format for the use case. 
Agents can be defined for specificity or high-level thought processes." + ] + }, + { + "cell_type": "markdown", + "id": "00fa06af-c231-4237-8d62-dc42ff0f59de", + "metadata": {}, + "source": [ + "## 5.1 Zero-Shot ReAct agent\n", + "ReAct stands for **Reasoning and Acting**. This simplifies the answer to infer as much context as possible. \n", + "We start by importing the initialize_agent function and AgentType for agent creation and configuration" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a1e3cf87-c98c-4dfa-9ab4-65bdee6f47df", + "metadata": {}, + "outputs": [], + "source": [ + "from langchain.agents import initialize_agent, AgentType, load_tools\n", + "\n", + "# Define LLM\n", + "llm = OpenAI(model_name=\"gpt-3.5-turbo-instruct\", temperature=0, openai_api_key=OPENAI_API_KEY)\n", + "\n", + "# Define what tools the agent will use, it can be more than one tool\n", + "tools = load_tools([\"llm-math\"], llm=llm)\n", + "agent = initialize_agent(tools, llm, agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION, verbose=True)\n", + "agent.run(\"What is 10 multiplied by 50?\")" + ] + }, + { + "cell_type": "markdown", + "id": "aab0fb85-881a-4b00-b1a6-1cee7d7a3f75", + "metadata": {}, + "source": [ + "## 5.2 Other Agents \n", + "There are a lot of other agents and tools in LangChain. For example, in order to interact with a database or structured dataset we will utilise an ```SQLAgent```" + ] + }, + { + "cell_type": "markdown", + "id": "971ae0ee-c01d-4fea-b992-110d6c7e0edf", + "metadata": {}, + "source": [ + "# 6. Evaluating LLM Outputs in LangChain\n", + "As mentioned in Lectures, it's important to evaluate LLM model outputs as well as all ML based outputs for that matter. \n", + "Although Gen AI may seem very smart, the models still make a lot of mistakes. As such, evaluating AI applications is important for several reasons. \n", + "- First, it checks if the AI model can accurately interpret and respond to a variety of inputs. 
This is vital in applications where responses inform decision-making, and reliability is paramount. \n", + "- Evaluation also helps identify the strengths and weaknesses of a model, which allows for targeted and continuous improvements, and builds trust among users and stakeholders. \n", + "- Evaluation allows us to re-align model output with human intent, getting to the ideal responses faster.\n", + "\n", + "## LangChain evaluation tools\n", + "LangChain has built-in evaluation tools for comparing model outputs based on common criteria, such as relevance and correctness. It also provides tools for defining custom criteria, which we can tailor to specific use cases. Finally, the ```QAEvalChain``` class is another tool that can be used to measure how well an AI's response answers a specific question using ground truth responses." + ] + }, + { + "cell_type": "markdown", + "id": "abf96d76-5901-4fe0-83b5-881e6e340b92", + "metadata": {}, + "source": [ + "## 6.1 LangChain Built-in Evaluation Metrics" + ] + }, + { + "cell_type": "markdown", + "id": "c7752f75-65d4-4a91-83dc-3b4740e544d9", + "metadata": {}, + "source": [ + "**EXERCISE-6: Explore Evaluation Metrics in LangChain**\n", + "- run this import statement: ```from langchain.evaluation import Criteria```\n", + "- use the ``list`` function on Criteria to check the list of available criteria" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "id": "ab44f6f3-2489-4670-88f4-a3361c6a7fc3", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Failed to batch ingest runs: LangSmithError('Failed to POST https://api.smith.langchain.com/runs/batch in LangSmith API. 
HTTPError(\\'403 Client Error: Forbidden for url: https://api.smith.langchain.com/runs/batch\\', \\'{\"detail\":\"Forbidden\"}\\')')\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'reasoning': 'Step 1: Identify the criterion - relevance.\\n\\nStep 2: Read the input and submission to determine if they are referring to a real quote from the text.\\n\\nStep 3: The input is asking a math question, not referring to a quote from the text.\\n\\nStep 4: The submission is referring to a different topic, the capital of New York state, and not a quote from the text.\\n\\nStep 5: Therefore, the submission does not meet the criterion of relevance.\\n\\nConclusion: The submission does not meet the criterion of relevance.', 'value': 'N', 'score': 0}\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Failed to batch ingest runs: LangSmithError('Failed to POST https://api.smith.langchain.com/runs/batch in LangSmith API. HTTPError(\\'403 Client Error: Forbidden for url: https://api.smith.langchain.com/runs/batch\\', \\'{\"detail\":\"Forbidden\"}\\')')\n" + ] + } + ], + "source": [ + "from langchain.chat_models import ChatOpenAI\n", + "from langchain.evaluation import load_evaluator\n", + "\n", + "\n", + "llm = OpenAI(openai_api_key=OPENAI_API_KEY)\n", + "\n", + "evaluator = load_evaluator(\"criteria\", criteria=\"relevance\",llm=llm)\n", + "eval_result = evaluator.evaluate_strings(prediction=\"The capital of New York state is Albany\",input=\"What is 26 + 43?\")\n", + "print(eval_result)" + ] + }, + { + "cell_type": "markdown", + "id": "28a2c168-9b70-427d-9e0b-231212ec7699", + "metadata": {}, + "source": [ + "**EXERCISE-7: Try doing the same evaluation above with a different LLM (e.g., Mistral)**" + ] + }, + { + "cell_type": "markdown", + "id": "ce53d768-a26c-4509-948e-c464ebd20310", + "metadata": {}, + "source": [ + "## 6.2 Defining Custom Metrics\n", + "To customize the criteria, we need to evaluate the specific use case 
and define a dictionary named custom_criteria. This example adds simplicity, bias, clarity, and truthfulness criteria. Custom criteria work by mapping criteria names to the questions that are used to evaluate the strings. To use these new criteria, create an evaluator object, but this time, using our custom_criteria." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3b248976-85a5-4ef3-9998-48cf1afa9311", + "metadata": {}, + "outputs": [], + "source": [ + "custom_criteria = {\"simplicity\": \"Does the language use brevity?\",\n", + " \"bias\": \"Does the language stay free of human bias?\",\n", + " \"clarity\": \"Is the writing easy to understand?\",\n", + " \"truthfulness\": \"Is the writing honest and factual?\"}\n", + "\n", + "evaluator = load_evaluator(\"criteria\", criteria=custom_criteria,\n", + " llm=llm)\n", + "eval_result = evaluator.evaluate_strings(input=\"What is the best Italian restaurant in New York City?\",\n", + "prediction=\"That is a subjective statement and I cannot answer that.\")\n", + "print(eval_result)" + ] + }, + { + "cell_type": "markdown", + "id": "ef61e633-de72-4019-940c-7021b0e7c2e1", + "metadata": {}, + "source": [ + "## 6.3 QAEvalChain\n", + "Question-Answering (QA) is one of the most popular applications of LLMs. But it is not always obvious what parameters (e.g., chunk size) or components (e.g., model choice, VectorDB) yield the best QA performance in the system we are building. The QA eval chain is an LLM chain for evaluating the performance of an LLM on a QA task. Refer to this detailed [LangChain blog post](https://blog.langchain.dev/auto-eval-of-question-answering-tasks/) for details about QAEvalChain." + ] + }, + { + "cell_type": "markdown", + "id": "1585ea3a-1be2-40db-9c88-774c03e220a7", + "metadata": {}, + "source": [ + "### 6.3.1 Trying out QAEvalChain\n", + "As a metric, QAEvalChain focuses on the **accuracy** and **relevance** of the response. 
In this chain, RAG will be used to store the document and ground truth responses, and an evaluation model instance is used to compare the semantic meaning of a model's results with the ground truth. \n", + "\n", + "First, we load our data source, in this case, a PDF document, and split it into chunks. Next, we set up the embeddings model, vector database, and LLM, and combine them in a chain. The input_key is set to \"question\", as questions will be used to query the database" + ] + }, + { + "cell_type": "markdown", + "id": "3e59bac0-ed4c-4ffe-b64c-423f88f1aab3", + "metadata": {}, + "source": [ + "### Create a RAG Retriever " + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "id": "2fc52a82-7b90-4b0f-953a-32cb90beee54", + "metadata": {}, + "outputs": [], + "source": [ + "# Let's load the dengue document and then store it in a database\n", + "loader = PyPDFLoader(str(FILE_DENGUE))\n", + "data = loader.load()\n", + "\n", + "chunk_size = 100\n", + "chunk_overlap = 50\n", + "\n", + "# Split with RecursiveCharacterTextSplitter\n", + "rc_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap = chunk_overlap)\n", + "docs = rc_splitter.split_documents(data)\n", + "\n", + "# Let's use the OpenAI embedding model (the key must be passed as openai_api_key)\n", + "embedding_model = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)\n", + "\n", + "# Directory to store our database-set this to the data directory\n", + "vectordb = Chroma(persist_directory=str(DIR_DATA), embedding_function=embedding_model)\n", + "\n", + "# Store the database\n", + "vectordb.persist()\n", + "\n", + "# Create the database\n", + "docstorage = Chroma.from_documents(docs, embedding_model)\n", + "\n", + "# LLM\n", + "llm = OpenAI(model_name=\"gpt-3.5-turbo-instruct\", openai_api_key=OPENAI_API_KEY)\n", + "\n", + "# Define the retriever chain\n", + "qa = RetrievalQA.from_chain_type(llm=llm, chain_type=\"stuff\", retriever=docstorage.as_retriever(), input_key=\"question\")" + ] + }, + { + "cell_type": "markdown", + "id": 
"a9dddef3-e7ab-4c3e-9086-121be7b3b8a4", + "metadata": {}, + "source": [ + "## Define a Question Set as Key-Value Pairs in a Dict\n", + "This is a ground-truth dataset which a list of questions and their correct responses." + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "id": "f4d95da4-f130-4a2f-b0e5-40d22146674e", + "metadata": {}, + "outputs": [], + "source": [ + "question_set = [{\"question\": \"Did dengue cases increase in 2023?\",\n", + " \"answer\": \"Yes, in 2023, there was an increase in cases globally.\"},\n", + " {\"question\": \"According to the document, which are the top four regions affected by arboviral diseases?\",\n", + " \"answer\": \"Africa is oe of the top four regions\"},\n", + " {\"question\": \"How is dengue virus transimitted to humans?\",\n", + " \"answer\": \"through the bite of infected mosquitoes\"}]" + ] + }, + { + "cell_type": "markdown", + "id": "7a509f17-4bec-4bdc-805e-321b37581a84", + "metadata": {}, + "source": [ + "## Run QAEVAL" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "id": "5dff29e4-037b-464f-9135-3f311daf4727", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Failed to batch ingest runs: LangSmithError('Failed to POST https://api.smith.langchain.com/runs/batch in LangSmith API. HTTPError(\\'403 Client Error: Forbidden for url: https://api.smith.langchain.com/runs/batch\\', \\'{\"detail\":\"Forbidden\"}\\')')\n", + "Failed to batch ingest runs: LangSmithError('Failed to POST https://api.smith.langchain.com/runs/batch in LangSmith API. HTTPError(\\'403 Client Error: Forbidden for url: https://api.smith.langchain.com/runs/batch\\', \\'{\"detail\":\"Forbidden\"}\\')')\n", + "Failed to batch ingest runs: LangSmithError('Failed to POST https://api.smith.langchain.com/runs/batch in LangSmith API. 
HTTPError(\\'403 Client Error: Forbidden for url: https://api.smith.langchain.com/runs/batch\\', \\'{\"detail\":\"Forbidden\"}\\')')\n", + "Failed to batch ingest runs: LangSmithError('Failed to POST https://api.smith.langchain.com/runs/batch in LangSmith API. HTTPError(\\'403 Client Error: Forbidden for url: https://api.smith.langchain.com/runs/batch\\', \\'{\"detail\":\"Forbidden\"}\\')')\n", + "Failed to batch ingest runs: LangSmithError('Failed to POST https://api.smith.langchain.com/runs/batch in LangSmith API. HTTPError(\\'403 Client Error: Forbidden for url: https://api.smith.langchain.com/runs/batch\\', \\'{\"detail\":\"Forbidden\"}\\')')\n", + "Failed to batch ingest runs: LangSmithError('Failed to POST https://api.smith.langchain.com/runs/batch in LangSmith API. HTTPError(\\'403 Client Error: Forbidden for url: https://api.smith.langchain.com/runs/batch\\', \\'{\"detail\":\"Forbidden\"}\\')')\n", + "Failed to batch ingest runs: LangSmithError('Failed to POST https://api.smith.langchain.com/runs/batch in LangSmith API. HTTPError(\\'403 Client Error: Forbidden for url: https://api.smith.langchain.com/runs/batch\\', \\'{\"detail\":\"Forbidden\"}\\')')\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[{'results': ' CORRECT'}, {'results': ' INCORRECT'}, {'results': ' CORRECT'}]\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Failed to batch ingest runs: LangSmithError('Failed to POST https://api.smith.langchain.com/runs/batch in LangSmith API. 
HTTPError(\\'403 Client Error: Forbidden for url: https://api.smith.langchain.com/runs/batch\\', \\'{\"detail\":\"Forbidden\"}\\')')\n" + ] + } + ], + "source": [ + "from langchain.evaluation import QAEvalChain\n", + "predictions = qa.apply(question_set)\n", + "eval_chain = QAEvalChain.from_llm(llm)\n", + "\n", + "results = eval_chain.evaluate(question_set,predictions, question_key=\"question\",prediction_key=\"result\", answer_key='answer')\n", + "print(results)" + ] + }, + { + "cell_type": "markdown", + "id": "db20cb10-2438-43b2-b9d0-df3ae73d6418", + "metadata": {}, + "source": [ + "**EXERCISE-8 (Do this in Your Groups): Run Evaluation on a Custom Eval Dataset for a RAG Chatbot QA Task**\n", + "1. Create a RAG LLM Chain as we have done before.\n", + "Please identify a PDF document to use which contains some new information that the LLMs do not have. \n", + "Note that it can be a French or English document.\n", + "2. Create 5 pairs of questions and correct answers to use to evaluate your RAG\n", + "3. Run QAEVAL on the eval dataset and report how many responses the LLM got correct.\n", + "4. Do this again with a different LLM (e.g., Falcon or Mistral) and compare performance across models. *Note that your eval dataset remains the same.*" + ] + }, + { + "cell_type": "markdown", + "id": "20065378-5cd5-4b0f-ae9a-c2869075441a", + "metadata": {}, + "source": [ + "# 7. Summary\n", + "-----\n", + "In this notebook, we covered the basics of how to use LangChain to interact with both proprietary models from OpenAI and open source LLMs through the Hugging Face library. We noted that there are two approaches to building Chains with LangChain: either using the functions or using the LCEL syntax. We covered key topics as follows: creating chains and interacting with LLMs; managing the memory of chat models; setting up RAG-based chains which incorporate external documents; and evaluating LLM outputs. 
\n", + "\n", + "What we have covered in this notebook is the tip of the iceberg, just to get you started on building LLM-based applications with LangChain and other tools. There are a lot of other things to learn and check.\n", + "- What are other frameworks which perform the same tasks as LangChain?\n", + "- LangChain Agents and LLM agents in general\n", + "- Vector databases and their role \n", + "- How to work with different document sources (e.g., websites)\n", + "- How to choose embedding models and the influence they have on generation\n", + "- Which model to use: instruct/chat/text generation\n", + "- and more " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ab0ee5b1-8133-4405-98fc-e56057daece6", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python3.12-audio", + "language": "python", + "name": "audio" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.2" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/notebooks/tunisia-may-24/README.md b/notebooks/tunisia-may-24/README.md new file mode 100644 index 0000000..38f8eff --- /dev/null +++ b/notebooks/tunisia-may-24/README.md @@ -0,0 +1,87 @@ +# Programming Activities for the Course + +This document outlines the programming activities for the course, focusing on hands-on projects to apply the concepts learned. The document is organized into two main sections: an introduction to LLM capabilities and LangChain, followed by a practical exercise on deploying a chatbot with Streamlit. 
+ +## LLM Foundations-understanding the ML Process + +## Introducing LLM Capabilities and LangChain + +In this section, we explore the foundational capabilities of Large Language Models (LLMs) and how they can be applied in real-world scenarios. By leveraging LLMs, you can build applications such as chatbots, document analyzers, and automated support systems. + +### Understanding LLM Capabilities +LLMs are capable of generating human-like text, answering questions, summarizing content, and even performing tasks like sentiment analysis and named entity recognition. These models can process and interpret vast amounts of textual data, making them ideal for a variety of applications across domains. + +### Introducing LangChain +LangChain is a powerful framework that simplifies the process of integrating LLMs into applications. It provides modular components and utility functions to create chains (pipelines) that combine different tasks, such as prompting, data processing, and memory management, all within a cohesive system. LangChain makes it easier to build applications that require complex interactions with language models, including: + +- **Prompt Engineering**: Designing effective prompts to achieve desired responses from the LLM. +- **Data Handling**: Loading, processing, and storing large document corpora. +- **Chain Management**: Creating workflows that link multiple steps, such as data loading, prompt generation, and response handling. + +### Building Applications with LangChain +LangChain supports various use cases, including: + +1. **QA Chatbots**: Answering user questions based on specific datasets. +2. **Document Analysis**: Extracting information, summarizing content, or classifying documents. +3. **Automated Support Systems**: Handling customer service or FAQ queries. + +In this course, we will apply these capabilities to build a QA chatbot using LangChain, deploy it on Streamlit, and explore its functionality through real-world examples. 
+ +--- + + +# Deploying a Chatbot on Streamlit +In this activity, you will use the knowledge gained from the LangChain Tutorial to explore a chatbot deployed on Streamlit. You will deploy this app on your computer and interact with it. + +## About Streamlit + +As discussed in the lectures, Streamlit is a platform that enables data scientists to deploy dynamic, data-based apps. It’s ideal for prototyping demonstration apps and sharing them with stakeholders before full-scale production deployment. + +## Initial Setup and Getting the Chatbot Files + +1. **Get OpenAI and Hugging Face API Credentials** + The chatbot uses OpenAI models, so you’ll need to sign up for an OpenAI developer account and obtain an API key. For a step-by-step guide on creating an OpenAI API key, search for instructions on ChatGPT. Similarly, create a Hugging Face account and obtain an API token. + +2. **Try the Chatbot on Streamlit Community Cloud** + Before downloading anything, you can try the chatbot on the Streamlit Community Cloud with just the OpenAI and Hugging Face keys. + +3. **Download or Clone the Project Repository** + To get the project files on your computer, either clone the GitHub repository (if familiar with Git) or download the repository as a zipped file. + +## Deploying the Streamlit App Locally + +1. **Unzip and Navigate to the Project Folder** + Once unzipped, open the project folder and follow the instructions on the GitHub page to deploy the chatbot. + +2. **Follow steps on GitHub project repository**. [Streamlit app repo](https://github.com/worldbank/RAG-Based-ChatBot-Example) + + +3. **Install Required Packages** + The `requirements.txt` file contains a list of all required packages. If you encounter a missing package error, try installing the package again (ensuring your virtual environment is activated). + +4. **Run the App Locally** + Run the app with the following command: + ```bash + streamlit run streamlit_app.py + ``` +5. **Test and Check**. 
When deployed locally, you can browse the files being used in the app. + +## Explore Important Scripts + +The essential components for building a chatbot with LangChain are organized into distinct, modular Python scripts. Let’s explore some of these elements. You can use VS Code or your preferred text editor for this task. + +### Loading Files +In real-life applications, you may need to load hundreds of documents, requiring a versatile function for file loading. This project includes two types of loaders: +- **`remote_loader.py`**: For loading documents from websites. +- **`local_loader.py`**: For loading documents from the local `data` folder. + +### Document Splitting +The `splitter.py` module uses the `RecursiveCharacterTextSplitter` strategy, with a chunk size of 1000 and an overlap of 0. This method helps in breaking down large documents into manageable sections for processing. + +### Prompt Chains +In the `full_chain.py`, `base_chain.py`, and `rag_chain.py` modules, you’ll find configurations for the specific LLM models and prompting strategies used. The project utilizes OpenAI chat models, with customized chains designed to guide interactions effectively. + +### Memory Management +Memory management strategies are also implemented to optimize the chatbot’s performance, particularly for long interactions or when processing large datasets. 
+ + diff --git a/notebooks/world-bank-api.ipynb b/notebooks/world-bank-api.ipynb deleted file mode 100644 index 7e4bbf0..0000000 --- a/notebooks/world-bank-api.ipynb +++ /dev/null @@ -1,721 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "90700fdc-fcc7-4e54-8c9e-449879d8c66d", - "metadata": { - "tags": [] - }, - "source": [ - "# World Bank Indicators API Example\n", - "\n", - "> The following is an example of a [Jupyter notebook](https://jupyter.org) - a tutorial of how to retrieve data from the [World Bank Indicators API](https://datahelpdesk.worldbank.org/knowledgebase/articles/889392-about-the-indicators-api-documentation) - that illustrates how to use computational content with the [template](https://worldbank.github.io/template). " - ] - }, - { - "cell_type": "markdown", - "id": "e0d992a6-f656-45ce-a025-f824901e8797", - "metadata": {}, - "source": [ - "## Requirements" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "1811080b-c4c6-43cb-9e46-5cfa65d54abf", - "metadata": {}, - "outputs": [], - "source": [ - "import itertools\n", - "\n", - "import pandas\n", - "import requests\n", - "from bokeh.palettes import Spectral6\n", - "from bokeh.plotting import figure, output_notebook, show" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "id": "fb8d2738-535e-4957-b82a-987891955a7f", - "metadata": {}, - "source": [ - "## Data Retrieval\n", - "\n", - "In this example, we retrieve **Population, total** (`SP.POP.TOTL`) from the [World Bank Indicators](https://data.worldbank.org/indicator) for [BRICS](https://infobrics.org)." 
- ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "c955864a-1889-4f7f-a29e-108b0534846b", - "metadata": {}, - "outputs": [], - "source": [ - "url = \"https://api.worldbank.org/v2/country/chn;bra;ind;rus;zaf/indicator/SP.POP.TOTL?format=json&per_page=10000\"" - ] - }, - { - "cell_type": "markdown", - "id": "6b5aac7c-bf80-4daa-a4a4-eebe8edc97bb", - "metadata": {}, - "source": [ - "Let's use [requests](https://requests.readthedocs.io) to send a GET request," - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "8d699f28-853a-40a1-8ea9-8dd566962454", - "metadata": {}, - "outputs": [], - "source": [ - "r = requests.get(url)" - ] - }, - { - "cell_type": "markdown", - "id": "a21cf193-ec13-45b8-9726-bb960ac8586a", - "metadata": {}, - "source": [ - "Now, let's normalize and create `pandas.DataFrame` from the response," - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "3dc152b2-95ba-473a-a416-2c4d5bac7622", - "metadata": {}, - "outputs": [], - "source": [ - "# normalize\n", - "data = pandas.json_normalize(r.json()[-1])\n", - "\n", - "# create dataframe\n", - "df = pandas.DataFrame.from_dict(data)" - ] - }, - { - "cell_type": "markdown", - "id": "241904c0-35b9-4e43-a3f9-f97738ea9fd1", - "metadata": {}, - "source": [ - "```{tip}\n", - "Alternatively, the World Bank API supports downloading the data as an [archive](http://api.worldbank.org/v2/country/all/indicator/SP.POP.TOTL?date=2000&source=2&downloadformat=csv). \n", - "```" - ] - }, - { - "cell_type": "markdown", - "id": "bae3462b-f49c-4b8a-badb-8f580b4fc268", - "metadata": {}, - "source": [ - "Let's take a look at the dataframe, " - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "c0bbef2d-495c-4140-b8ac-45ee47772142", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
countryiso3codeBRACHNINDRUSZAF
date
196073.092515667.070445.954579119.89700016.520441
196175.330008660.330456.351876121.23600016.989464
196277.599218665.770467.024193122.59100017.503133
196379.915555682.335477.933619123.96000018.042215
196482.262794698.355489.059309125.34500018.603097
..................
2018210.1665921402.7601369.003306144.47785957.339635
2019211.7828781407.7451383.112050144.40626158.087055
2020213.1963041411.1001396.387127144.07313958.801927
2021214.3262231412.3601407.563842144.13048259.392255
2022215.3134981412.1751417.173173144.23693359.893885
\n", - "

63 rows × 5 columns

\n", - "
" - ], - "text/plain": [ - "countryiso3code BRA CHN IND RUS ZAF\n", - "date \n", - "1960 73.092515 667.070 445.954579 119.897000 16.520441\n", - "1961 75.330008 660.330 456.351876 121.236000 16.989464\n", - "1962 77.599218 665.770 467.024193 122.591000 17.503133\n", - "1963 79.915555 682.335 477.933619 123.960000 18.042215\n", - "1964 82.262794 698.355 489.059309 125.345000 18.603097\n", - "... ... ... ... ... ...\n", - "2018 210.166592 1402.760 1369.003306 144.477859 57.339635\n", - "2019 211.782878 1407.745 1383.112050 144.406261 58.087055\n", - "2020 213.196304 1411.100 1396.387127 144.073139 58.801927\n", - "2021 214.326223 1412.360 1407.563842 144.130482 59.392255\n", - "2022 215.313498 1412.175 1417.173173 144.236933 59.893885\n", - "\n", - "[63 rows x 5 columns]" - ] - }, - "execution_count": 5, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df = df.pivot_table(values=\"value\", index=\"date\", columns=\"countryiso3code\")\n", - "df = df / 1e6 # scaling\n", - "df" - ] - }, - { - "cell_type": "markdown", - "id": "27f26a03-6d7a-4c86-8a02-1ea341d7ac5b", - "metadata": {}, - "source": [ - "## Visualization" - ] - }, - { - "cell_type": "markdown", - "id": "38e8582b-2a51-4908-8356-76cb6158fdc3", - "metadata": {}, - "source": [ - "Let's now plot the data as a time series using [Bokeh](https://docs.bokeh.org)." - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "78041d94-56a6-43ff-a307-8e8a3b377858", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/html": [ - " \n", - "
\n", - " \n", - " Loading BokehJS ...\n", - "
\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/javascript": [ - "(function(root) {\n", - " function now() {\n", - " return new Date();\n", - " }\n", - "\n", - " const force = true;\n", - "\n", - " if (typeof root._bokeh_onload_callbacks === \"undefined\" || force === true) {\n", - " root._bokeh_onload_callbacks = [];\n", - " root._bokeh_is_loading = undefined;\n", - " }\n", - "\n", - "const JS_MIME_TYPE = 'application/javascript';\n", - " const HTML_MIME_TYPE = 'text/html';\n", - " const EXEC_MIME_TYPE = 'application/vnd.bokehjs_exec.v0+json';\n", - " const CLASS_NAME = 'output_bokeh rendered_html';\n", - "\n", - " /**\n", - " * Render data to the DOM node\n", - " */\n", - " function render(props, node) {\n", - " const script = document.createElement(\"script\");\n", - " node.appendChild(script);\n", - " }\n", - "\n", - " /**\n", - " * Handle when an output is cleared or removed\n", - " */\n", - " function handleClearOutput(event, handle) {\n", - " function drop(id) {\n", - " const view = Bokeh.index.get_by_id(id)\n", - " if (view != null) {\n", - " view.model.document.clear()\n", - " Bokeh.index.delete(view)\n", - " }\n", - " }\n", - "\n", - " const cell = handle.cell;\n", - "\n", - " const id = cell.output_area._bokeh_element_id;\n", - " const server_id = cell.output_area._bokeh_server_id;\n", - "\n", - " // Clean up Bokeh references\n", - " if (id != null) {\n", - " drop(id)\n", - " }\n", - "\n", - " if (server_id !== undefined) {\n", - " // Clean up Bokeh references\n", - " const cmd_clean = \"from bokeh.io.state import curstate; print(curstate().uuid_to_server['\" + server_id + \"'].get_sessions()[0].document.roots[0]._id)\";\n", - " cell.notebook.kernel.execute(cmd_clean, {\n", - " iopub: {\n", - " output: function(msg) {\n", - " const id = msg.content.text.trim()\n", - " drop(id)\n", - " }\n", - " }\n", - " });\n", - " // Destroy server and session\n", - " const cmd_destroy = \"import bokeh.io.notebook as 
ion; ion.destroy_server('\" + server_id + \"')\";\n", - " cell.notebook.kernel.execute(cmd_destroy);\n", - " }\n", - " }\n", - "\n", - " /**\n", - " * Handle when a new output is added\n", - " */\n", - " function handleAddOutput(event, handle) {\n", - " const output_area = handle.output_area;\n", - " const output = handle.output;\n", - "\n", - " // limit handleAddOutput to display_data with EXEC_MIME_TYPE content only\n", - " if ((output.output_type != \"display_data\") || (!Object.prototype.hasOwnProperty.call(output.data, EXEC_MIME_TYPE))) {\n", - " return\n", - " }\n", - "\n", - " const toinsert = output_area.element.find(\".\" + CLASS_NAME.split(' ')[0]);\n", - "\n", - " if (output.metadata[EXEC_MIME_TYPE][\"id\"] !== undefined) {\n", - " toinsert[toinsert.length - 1].firstChild.textContent = output.data[JS_MIME_TYPE];\n", - " // store reference to embed id on output_area\n", - " output_area._bokeh_element_id = output.metadata[EXEC_MIME_TYPE][\"id\"];\n", - " }\n", - " if (output.metadata[EXEC_MIME_TYPE][\"server_id\"] !== undefined) {\n", - " const bk_div = document.createElement(\"div\");\n", - " bk_div.innerHTML = output.data[HTML_MIME_TYPE];\n", - " const script_attrs = bk_div.children[0].attributes;\n", - " for (let i = 0; i < script_attrs.length; i++) {\n", - " toinsert[toinsert.length - 1].firstChild.setAttribute(script_attrs[i].name, script_attrs[i].value);\n", - " toinsert[toinsert.length - 1].firstChild.textContent = bk_div.children[0].textContent\n", - " }\n", - " // store reference to server id on output_area\n", - " output_area._bokeh_server_id = output.metadata[EXEC_MIME_TYPE][\"server_id\"];\n", - " }\n", - " }\n", - "\n", - " function register_renderer(events, OutputArea) {\n", - "\n", - " function append_mime(data, metadata, element) {\n", - " // create a DOM node to render to\n", - " const toinsert = this.create_output_subarea(\n", - " metadata,\n", - " CLASS_NAME,\n", - " EXEC_MIME_TYPE\n", - " );\n", - " 
this.keyboard_manager.register_events(toinsert);\n", - " // Render to node\n", - " const props = {data: data, metadata: metadata[EXEC_MIME_TYPE]};\n", - " render(props, toinsert[toinsert.length - 1]);\n", - " element.append(toinsert);\n", - " return toinsert\n", - " }\n", - "\n", - " /* Handle when an output is cleared or removed */\n", - " events.on('clear_output.CodeCell', handleClearOutput);\n", - " events.on('delete.Cell', handleClearOutput);\n", - "\n", - " /* Handle when a new output is added */\n", - " events.on('output_added.OutputArea', handleAddOutput);\n", - "\n", - " /**\n", - " * Register the mime type and append_mime function with output_area\n", - " */\n", - " OutputArea.prototype.register_mime_type(EXEC_MIME_TYPE, append_mime, {\n", - " /* Is output safe? */\n", - " safe: true,\n", - " /* Index of renderer in `output_area.display_order` */\n", - " index: 0\n", - " });\n", - " }\n", - "\n", - " // register the mime type if in Jupyter Notebook environment and previously unregistered\n", - " if (root.Jupyter !== undefined) {\n", - " const events = require('base/js/events');\n", - " const OutputArea = require('notebook/js/outputarea').OutputArea;\n", - "\n", - " if (OutputArea.prototype.mime_types().indexOf(EXEC_MIME_TYPE) == -1) {\n", - " register_renderer(events, OutputArea);\n", - " }\n", - " }\n", - " if (typeof (root._bokeh_timeout) === \"undefined\" || force === true) {\n", - " root._bokeh_timeout = Date.now() + 5000;\n", - " root._bokeh_failed_load = false;\n", - " }\n", - "\n", - " const NB_LOAD_WARNING = {'data': {'text/html':\n", - " \"
\\n\"+\n", - " \"

\\n\"+\n", - " \"BokehJS does not appear to have successfully loaded. If loading BokehJS from CDN, this \\n\"+\n", - " \"may be due to a slow or bad network connection. Possible fixes:\\n\"+\n", - " \"

\\n\"+\n", - " \"
    \\n\"+\n", - " \"
  • re-rerun `output_notebook()` to attempt to load from CDN again, or
  • \\n\"+\n", - " \"
  • use INLINE resources instead, as so:
  • \\n\"+\n", - " \"
\\n\"+\n", - " \"\\n\"+\n", - " \"from bokeh.resources import INLINE\\n\"+\n", - " \"output_notebook(resources=INLINE)\\n\"+\n", - " \"\\n\"+\n", - " \"
\"}};\n", - "\n", - " function display_loaded() {\n", - " const el = document.getElementById(\"b627ae3b-1db0-4fe3-8762-32a285a44007\");\n", - " if (el != null) {\n", - " el.textContent = \"BokehJS is loading...\";\n", - " }\n", - " if (root.Bokeh !== undefined) {\n", - " if (el != null) {\n", - " el.textContent = \"BokehJS \" + root.Bokeh.version + \" successfully loaded.\";\n", - " }\n", - " } else if (Date.now() < root._bokeh_timeout) {\n", - " setTimeout(display_loaded, 100)\n", - " }\n", - " }\n", - "\n", - " function run_callbacks() {\n", - " try {\n", - " root._bokeh_onload_callbacks.forEach(function(callback) {\n", - " if (callback != null)\n", - " callback();\n", - " });\n", - " } finally {\n", - " delete root._bokeh_onload_callbacks\n", - " }\n", - " console.debug(\"Bokeh: all callbacks have finished\");\n", - " }\n", - "\n", - " function load_libs(css_urls, js_urls, callback) {\n", - " if (css_urls == null) css_urls = [];\n", - " if (js_urls == null) js_urls = [];\n", - "\n", - " root._bokeh_onload_callbacks.push(callback);\n", - " if (root._bokeh_is_loading > 0) {\n", - " console.debug(\"Bokeh: BokehJS is being loaded, scheduling callback at\", now());\n", - " return null;\n", - " }\n", - " if (js_urls == null || js_urls.length === 0) {\n", - " run_callbacks();\n", - " return null;\n", - " }\n", - " console.debug(\"Bokeh: BokehJS not loaded, scheduling load and callback at\", now());\n", - " root._bokeh_is_loading = css_urls.length + js_urls.length;\n", - "\n", - " function on_load() {\n", - " root._bokeh_is_loading--;\n", - " if (root._bokeh_is_loading === 0) {\n", - " console.debug(\"Bokeh: all BokehJS libraries/stylesheets loaded\");\n", - " run_callbacks()\n", - " }\n", - " }\n", - "\n", - " function on_error(url) {\n", - " console.error(\"failed to load \" + url);\n", - " }\n", - "\n", - " for (let i = 0; i < css_urls.length; i++) {\n", - " const url = css_urls[i];\n", - " const element = document.createElement(\"link\");\n", - " element.onload = 
on_load;\n", - " element.onerror = on_error.bind(null, url);\n", - " element.rel = \"stylesheet\";\n", - " element.type = \"text/css\";\n", - " element.href = url;\n", - " console.debug(\"Bokeh: injecting link tag for BokehJS stylesheet: \", url);\n", - " document.body.appendChild(element);\n", - " }\n", - "\n", - " for (let i = 0; i < js_urls.length; i++) {\n", - " const url = js_urls[i];\n", - " const element = document.createElement('script');\n", - " element.onload = on_load;\n", - " element.onerror = on_error.bind(null, url);\n", - " element.async = false;\n", - " element.src = url;\n", - " console.debug(\"Bokeh: injecting script tag for BokehJS library: \", url);\n", - " document.head.appendChild(element);\n", - " }\n", - " };\n", - "\n", - " function inject_raw_css(css) {\n", - " const element = document.createElement(\"style\");\n", - " element.appendChild(document.createTextNode(css));\n", - " document.body.appendChild(element);\n", - " }\n", - "\n", - " const js_urls = [\"https://cdn.bokeh.org/bokeh/release/bokeh-3.3.4.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-gl-3.3.4.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-widgets-3.3.4.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-tables-3.3.4.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-mathjax-3.3.4.min.js\"];\n", - " const css_urls = [];\n", - "\n", - " const inline_js = [ function(Bokeh) {\n", - " Bokeh.set_log_level(\"info\");\n", - " },\n", - "function(Bokeh) {\n", - " }\n", - " ];\n", - "\n", - " function run_inline_js() {\n", - " if (root.Bokeh !== undefined || force === true) {\n", - " for (let i = 0; i < inline_js.length; i++) {\n", - " inline_js[i].call(root, root.Bokeh);\n", - " }\n", - "if (force === true) {\n", - " display_loaded();\n", - " }} else if (Date.now() < root._bokeh_timeout) {\n", - " setTimeout(run_inline_js, 100);\n", - " } else if (!root._bokeh_failed_load) {\n", - " console.log(\"Bokeh: BokehJS failed to load within specified timeout.\");\n", - " 
root._bokeh_failed_load = true;\n", - " } else if (force !== true) {\n", - " const cell = $(document.getElementById(\"b627ae3b-1db0-4fe3-8762-32a285a44007\")).parents('.cell').data().cell;\n", - " cell.output_area.append_execute_result(NB_LOAD_WARNING)\n", - " }\n", - " }\n", - "\n", - " if (root._bokeh_is_loading === 0) {\n", - " console.debug(\"Bokeh: BokehJS loaded, going straight to plotting\");\n", - " run_inline_js();\n", - " } else {\n", - " load_libs(css_urls, js_urls, function() {\n", - " console.debug(\"Bokeh: BokehJS plotting callback run at\", now());\n", - " run_inline_js();\n", - " });\n", - " }\n", - "}(window));" - ], - "application/vnd.bokehjs_load.v0+json": "(function(root) {\n function now() {\n return new Date();\n }\n\n const force = true;\n\n if (typeof root._bokeh_onload_callbacks === \"undefined\" || force === true) {\n root._bokeh_onload_callbacks = [];\n root._bokeh_is_loading = undefined;\n }\n\n\n if (typeof (root._bokeh_timeout) === \"undefined\" || force === true) {\n root._bokeh_timeout = Date.now() + 5000;\n root._bokeh_failed_load = false;\n }\n\n const NB_LOAD_WARNING = {'data': {'text/html':\n \"
\\n\"+\n \"

\\n\"+\n \"BokehJS does not appear to have successfully loaded. If loading BokehJS from CDN, this \\n\"+\n \"may be due to a slow or bad network connection. Possible fixes:\\n\"+\n \"

\\n\"+\n \"
    \\n\"+\n \"
  • re-rerun `output_notebook()` to attempt to load from CDN again, or
  • \\n\"+\n \"
  • use INLINE resources instead, as so:
  • \\n\"+\n \"
\\n\"+\n \"\\n\"+\n \"from bokeh.resources import INLINE\\n\"+\n \"output_notebook(resources=INLINE)\\n\"+\n \"\\n\"+\n \"
\"}};\n\n function display_loaded() {\n const el = document.getElementById(\"b627ae3b-1db0-4fe3-8762-32a285a44007\");\n if (el != null) {\n el.textContent = \"BokehJS is loading...\";\n }\n if (root.Bokeh !== undefined) {\n if (el != null) {\n el.textContent = \"BokehJS \" + root.Bokeh.version + \" successfully loaded.\";\n }\n } else if (Date.now() < root._bokeh_timeout) {\n setTimeout(display_loaded, 100)\n }\n }\n\n function run_callbacks() {\n try {\n root._bokeh_onload_callbacks.forEach(function(callback) {\n if (callback != null)\n callback();\n });\n } finally {\n delete root._bokeh_onload_callbacks\n }\n console.debug(\"Bokeh: all callbacks have finished\");\n }\n\n function load_libs(css_urls, js_urls, callback) {\n if (css_urls == null) css_urls = [];\n if (js_urls == null) js_urls = [];\n\n root._bokeh_onload_callbacks.push(callback);\n if (root._bokeh_is_loading > 0) {\n console.debug(\"Bokeh: BokehJS is being loaded, scheduling callback at\", now());\n return null;\n }\n if (js_urls == null || js_urls.length === 0) {\n run_callbacks();\n return null;\n }\n console.debug(\"Bokeh: BokehJS not loaded, scheduling load and callback at\", now());\n root._bokeh_is_loading = css_urls.length + js_urls.length;\n\n function on_load() {\n root._bokeh_is_loading--;\n if (root._bokeh_is_loading === 0) {\n console.debug(\"Bokeh: all BokehJS libraries/stylesheets loaded\");\n run_callbacks()\n }\n }\n\n function on_error(url) {\n console.error(\"failed to load \" + url);\n }\n\n for (let i = 0; i < css_urls.length; i++) {\n const url = css_urls[i];\n const element = document.createElement(\"link\");\n element.onload = on_load;\n element.onerror = on_error.bind(null, url);\n element.rel = \"stylesheet\";\n element.type = \"text/css\";\n element.href = url;\n console.debug(\"Bokeh: injecting link tag for BokehJS stylesheet: \", url);\n document.body.appendChild(element);\n }\n\n for (let i = 0; i < js_urls.length; i++) {\n const url = js_urls[i];\n const element = 
document.createElement('script');\n element.onload = on_load;\n element.onerror = on_error.bind(null, url);\n element.async = false;\n element.src = url;\n console.debug(\"Bokeh: injecting script tag for BokehJS library: \", url);\n document.head.appendChild(element);\n }\n };\n\n function inject_raw_css(css) {\n const element = document.createElement(\"style\");\n element.appendChild(document.createTextNode(css));\n document.body.appendChild(element);\n }\n\n const js_urls = [\"https://cdn.bokeh.org/bokeh/release/bokeh-3.3.4.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-gl-3.3.4.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-widgets-3.3.4.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-tables-3.3.4.min.js\", \"https://cdn.bokeh.org/bokeh/release/bokeh-mathjax-3.3.4.min.js\"];\n const css_urls = [];\n\n const inline_js = [ function(Bokeh) {\n Bokeh.set_log_level(\"info\");\n },\nfunction(Bokeh) {\n }\n ];\n\n function run_inline_js() {\n if (root.Bokeh !== undefined || force === true) {\n for (let i = 0; i < inline_js.length; i++) {\n inline_js[i].call(root, root.Bokeh);\n }\nif (force === true) {\n display_loaded();\n }} else if (Date.now() < root._bokeh_timeout) {\n setTimeout(run_inline_js, 100);\n } else if (!root._bokeh_failed_load) {\n console.log(\"Bokeh: BokehJS failed to load within specified timeout.\");\n root._bokeh_failed_load = true;\n } else if (force !== true) {\n const cell = $(document.getElementById(\"b627ae3b-1db0-4fe3-8762-32a285a44007\")).parents('.cell').data().cell;\n cell.output_area.append_execute_result(NB_LOAD_WARNING)\n }\n }\n\n if (root._bokeh_is_loading === 0) {\n console.debug(\"Bokeh: BokehJS loaded, going straight to plotting\");\n run_inline_js();\n } else {\n load_libs(css_urls, js_urls, function() {\n console.debug(\"Bokeh: BokehJS plotting callback run at\", now());\n run_inline_js();\n });\n }\n}(window));" - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - 
"\n", - "
\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/javascript": [ - "(function(root) {\n", - " function embed_document(root) {\n", - " const docs_json = {\"6c86f57c-0f70-4648-a27e-ed2b73c3308b\":{\"version\":\"3.3.4\",\"title\":\"Bokeh Application\",\"roots\":[{\"type\":\"object\",\"name\":\"Figure\",\"id\":\"p1001\",\"attributes\":{\"width\":700,\"x_range\":{\"type\":\"object\",\"name\":\"DataRange1d\",\"id\":\"p1002\"},\"y_range\":{\"type\":\"object\",\"name\":\"DataRange1d\",\"id\":\"p1003\"},\"x_scale\":{\"type\":\"object\",\"name\":\"LinearScale\",\"id\":\"p1011\"},\"y_scale\":{\"type\":\"object\",\"name\":\"LinearScale\",\"id\":\"p1012\"},\"title\":{\"type\":\"object\",\"name\":\"Title\",\"id\":\"p1004\",\"attributes\":{\"text\":\"Population, total (World Bank)\",\"text_font_size\":\"12pt\"}},\"renderers\":[{\"type\":\"object\",\"name\":\"GlyphRenderer\",\"id\":\"p1040\",\"attributes\":{\"data_source\":{\"type\":\"object\",\"name\":\"ColumnDataSource\",\"id\":\"p1034\",\"attributes\":{\"selected\":{\"type\":\"object\",\"name\":\"Selection\",\"id\":\"p1035\",\"attributes\":{\"indices\":[],\"line_indices\":[]}},\"selection_policy\":{\"type\":\"object\",\"name\":\"UnionRenderers\",\"id\":\"p1036\"},\"data\":{\"type\":\"map\",\"entries\":[[\"x\",{\"type\":\"ndarray\",\"array\":[\"1960\",\"1961\",\"1962\",\"1963\",\"1964\",\"1965\",\"1966\",\"1967\",\"1968\",\"1969\",\"1970\",\"1971\",\"1972\",\"1973\",\"1974\",\"1975\",\"1976\",\"1977\",\"1978\",\"1979\",\"1980\",\"1981\",\"1982\",\"1983\",\"1984\",\"1985\",\"1986\",\"1987\",\"1988\",\"1989\",\"1990\",\"1991\",\"1992\",\"1993\",\"1994\",\"1995\",\"1996\",\"1997\",\"1998\",\"1999\",\"2000\",\"2001\",\"2002\",\"2003\",\"2004\",\"2005\",\"2006\",\"2007\",\"2008\",\"2009\",\"2010\",\"2011\",\"2012\",\"2013\",\"2014\",\"2015\",\"2016\",\"2017\",\"2018\",\"2019\",\"2020\",\"2021\",\"2022\"],\"shape\":[63],\"dtype\":\"object\",\"order\":\"little\"}],[\"y\",{\"type\":\"ndarr
ay\",\"array\":{\"type\":\"bytes\",\"data\":\"7dgIxOtFUkDH2t/ZHtVSQCtLdJZZZlNAHaz/c5j6U0Bx5eyd0ZBUQJl+iXjrJ1VA16Gakqy+VUAdzCbAsFRWQDSBIhYx6lZAjKIHPgaAV0DpJjEIrBdYQE7U0twKsVhAeSKI83BMWUCVZB2OrupZQAiRDDm2ilpA4NbdPNUsW0AVi98UVtJbQPj9mxcnfFxARYMUPIUqXUDGGcOcoNxdQDtu+N10kl5AAvG6fsFKX0BDBBxCFQJgQO5Cc51GX2BAFNBE2HC8YEAz3IDPDxlhQPuWOV2WdGFAq+l6ouvOYUAlIvyLIChiQEuuYvEbgGJAe0ykNJvWYkDj/E0oxCpjQAvSjEXTfGNA+FPjpRvOY0CZ1NAGYB9kQNJWJZF9cGRAyv55GjDBZEDRr62ffhFlQADICRNGYWVAUfUrnY+vZUCOO6WD9ftlQJq0qbrHRmZAza/mAEGPZkD0wp0LI9RmQMnp6/kaF2dApb+XwoNZZ0BPzeUGQ5pnQJz4akfx2GdAb/HwnoMVaEB7ouvCj1BoQLlsdM5Pi2hARbx1/u3FaEBAwjBgSf9oQKCKG7cYN2lAaLPqc7VuaUAz/n3GBaZpQI7LuKmB22lAObnfoSgQakDP+L64VEVqQEHYKVYNeWpAXoJTH0imakB/hjdrcMpqQJEr9SwI6mpA\"},\"shape\":[63],\"dtype\":\"float64\",\"order\":\"little\"}]]}}},\"view\":{\"type\":\"object\",\"name\":\"CDSView\",\"id\":\"p1041\",\"attributes\":{\"filter\":{\"type\":\"object\",\"name\":\"AllIndices\",\"id\":\"p1042\"}}},\"glyph\":{\"type\":\"object\",\"name\":\"Line\",\"id\":\"p1037\",\"attributes\":{\"x\":{\"type\":\"field\",\"field\":\"x\"},\"y\":{\"type\":\"field\",\"field\":\"y\"},\"line_color\":\"#3288bd\",\"line_width\":2}},\"nonselection_glyph\":{\"type\":\"object\",\"name\":\"Line\",\"id\":\"p1038\",\"attributes\":{\"x\":{\"type\":\"field\",\"field\":\"x\"},\"y\":{\"type\":\"field\",\"field\":\"y\"},\"line_color\":\"#3288bd\",\"line_alpha\":0.1,\"line_width\":2}},\"muted_glyph\":{\"type\":\"object\",\"name\":\"Line\",\"id\":\"p1039\",\"attributes\":{\"x\":{\"type\":\"field\",\"field\":\"x\"},\"y\":{\"type\":\"field\",\"field\":\"y\"},\"line_color\":\"#3288bd\",\"line_alpha\":0.2,\"line_width\":2}}}},{\"type\":\"object\",\"name\":\"GlyphRenderer\",\"id\":\"p1051\",\"attributes\":{\"data_source\":{\"type\":\"object\",\"name\":\"ColumnDataSource\",\"id\":\"p1045\",\"attributes\":{\"selected\":{\"type\":\"object\",\"name\":\"Selection\",\"id\":\"p1046\",\"attributes\":{\"indices\":[],\"line_indices\":[]}},\"selection_policy\":{\"type\":\"object\",\"n
ame\":\"UnionRenderers\",\"id\":\"p1047\"},\"data\":{\"type\":\"map\",\"entries\":[[\"x\",{\"type\":\"ndarray\",\"array\":[\"1960\",\"1961\",\"1962\",\"1963\",\"1964\",\"1965\",\"1966\",\"1967\",\"1968\",\"1969\",\"1970\",\"1971\",\"1972\",\"1973\",\"1974\",\"1975\",\"1976\",\"1977\",\"1978\",\"1979\",\"1980\",\"1981\",\"1982\",\"1983\",\"1984\",\"1985\",\"1986\",\"1987\",\"1988\",\"1989\",\"1990\",\"1991\",\"1992\",\"1993\",\"1994\",\"1995\",\"1996\",\"1997\",\"1998\",\"1999\",\"2000\",\"2001\",\"2002\",\"2003\",\"2004\",\"2005\",\"2006\",\"2007\",\"2008\",\"2009\",\"2010\",\"2011\",\"2012\",\"2013\",\"2014\",\"2015\",\"2016\",\"2017\",\"2018\",\"2019\",\"2020\",\"2021\",\"2022\"],\"shape\":[63],\"dtype\":\"object\",\"order\":\"little\"}],[\"y\",{\"type\":\"ndarray\",\"array\":{\"type\":\"bytes\",\"data\":\"w/UoXI/YhEBxPQrXo6KEQFyPwvUozoRASOF6FK5ShUCkcD0K19KFQBSuR+F6WYZAMzMzMzP7hkBmZmZmZpSHQK5H4XoUNIhAMzMzMzPgiEDsUbgehZKJQKRwPQrXSIpACtejcD3wikDsUbgehY+LQM3MzMzMIoxAXI/C9SijjEAUrkfhehWNQHE9Cteje41AuB6F61HhjUDXo3A9CkiOQHsUrkfhqY5ArkfhehQPj0DXo3A9CoWPQBSuR+F6+o9AzczMzEwzkEBcj8L1KGyQQFyPwvUoq5BAcT0K1yPwkEDsUbgehTaRQJqZmZmZepFACtejcL28kUCF61G4HvuRQHsUrkfhM5JA9ihcj8JpkkCkcD0KV5+SQFK4HoVr05JAMzMzMzMGk0DNzMzMTDiTQArXo3C9Z5NAPQrXo/CSk0CuR+F6lLqTQGZmZmZm35NAmpmZmZkBlECamZmZmSGUQM3MzMxMQJRAexSuR+FelECuR+F6FHyUQNejcD2Kl5RAhetRuJ6ylEDXo3A9Cs2UQLgehevR5pRAcT0K1yMElUD2KFyPwiiVQClcj8L1TJVAPQrXo3BvlUA9CtejcI+VQFyPwvUor5VAj8L1KNzQlUDXo3A9CuuVQBSuR+H6/pVAZmZmZmYMlkA9CtejcBGWQDMzMzOzEJZA\"},\"shape\":[63],\"dtype\":\"float64\",\"order\":\"little\"}]]}}},\"view\":{\"type\":\"object\",\"name\":\"CDSView\",\"id\":\"p1052\",\"attributes\":{\"filter\":{\"type\":\"object\",\"name\":\"AllIndices\",\"id\":\"p1053\"}}},\"glyph\":{\"type\":\"object\",\"name\":\"Line\",\"id\":\"p1048\",\"attributes\":{\"x\":{\"type\":\"field\",\"field\":\"x\"},\"y\":{\"type\":\"field\",\"field\":\"y\"},\"line_color\":\"#99d594\",\"line_width\":2}},\"nonselection_glyph\":{\"type\":\"object\",\"name\":\"Line\",\"id
\":\"p1049\",\"attributes\":{\"x\":{\"type\":\"field\",\"field\":\"x\"},\"y\":{\"type\":\"field\",\"field\":\"y\"},\"line_color\":\"#99d594\",\"line_alpha\":0.1,\"line_width\":2}},\"muted_glyph\":{\"type\":\"object\",\"name\":\"Line\",\"id\":\"p1050\",\"attributes\":{\"x\":{\"type\":\"field\",\"field\":\"x\"},\"y\":{\"type\":\"field\",\"field\":\"y\"},\"line_color\":\"#99d594\",\"line_alpha\":0.2,\"line_width\":2}}}},{\"type\":\"object\",\"name\":\"GlyphRenderer\",\"id\":\"p1061\",\"attributes\":{\"data_source\":{\"type\":\"object\",\"name\":\"ColumnDataSource\",\"id\":\"p1055\",\"attributes\":{\"selected\":{\"type\":\"object\",\"name\":\"Selection\",\"id\":\"p1056\",\"attributes\":{\"indices\":[],\"line_indices\":[]}},\"selection_policy\":{\"type\":\"object\",\"name\":\"UnionRenderers\",\"id\":\"p1057\"},\"data\":{\"type\":\"map\",\"entries\":[[\"x\",{\"type\":\"ndarray\",\"array\":[\"1960\",\"1961\",\"1962\",\"1963\",\"1964\",\"1965\",\"1966\",\"1967\",\"1968\",\"1969\",\"1970\",\"1971\",\"1972\",\"1973\",\"1974\",\"1975\",\"1976\",\"1977\",\"1978\",\"1979\",\"1980\",\"1981\",\"1982\",\"1983\",\"1984\",\"1985\",\"1986\",\"1987\",\"1988\",\"1989\",\"1990\",\"1991\",\"1992\",\"1993\",\"1994\",\"1995\",\"1996\",\"1997\",\"1998\",\"1999\",\"2000\",\"2001\",\"2002\",\"2003\",\"2004\",\"2005\",\"2006\",\"2007\",\"2008\",\"2009\",\"2010\",\"2011\",\"2012\",\"2013\",\"2014\",\"2015\",\"2016\",\"2017\",\"2018\",\"2019\",\"2020\",\"2021\",\"2022\"],\"shape\":[63],\"dtype\":\"object\",\"order\":\"little\"}],[\"y\",{\"type\":\"ndarray\",\"array\":{\"type\":\"bytes\",\"data\":\"LSeh9EXfe0D1g7pIoYV8QK38MhhjMH1Ayv55GvDefUC8df7t8pB+QNumeFzUQX9AQgddwuHvf0Aqj26E5U+AQMmutIx0q4BAQni0cYQKgUCeQxmqAmyBQBAHCVH+z4FAcy8wK7Q2gkACDwwg3KCCQITyPo7GDYNAiSe7mTF8g0DC3sSQnOuDQMZpiCp8XYRAatlaXyTShEAP7zmw/EmFQPqbUIigxoVA04OCUvRGhkBGfv0QW8mGQB41JsScTodAjIaMRynXh0AUd7zJ72GIQE5jey0I74hAyXa+n7p9iUAa/P1i1g2KQFZETfQZoIpAK/uuCJ4zi0B/pl63iMeLQGmKAKeXXIxAdzHNdM/yjEBol299GIqNQOwy/Kc7Io5AXwg57z+6jkAb9RCNrlKPQA
Nd+wJ6649A8OAnDgBCkEAsZRniiI6QQNTRcTXi25BAfa1LjUApkUBF8wAWqXWRQE+Q2O4OwZFAH9rHCo4KkkCNDkjCflGSQFAYlGnElpJAnuv7cPDakkDUYBqGjx6TQLAgzVh0YpNAR1Z+GXymk0B8LH3o8umTQAzohTuHLJRArvTabPxslEBIMxZNd6uUQIyEtpyL6pRA7YFWYMgolUCG56ViA2SVQBE2PL1ynJVAmDEFa4zRlUByGMxfQf6VQDNOQ1SxJJZA\"},\"shape\":[63],\"dtype\":\"float64\",\"order\":\"little\"}]]}}},\"view\":{\"type\":\"object\",\"name\":\"CDSView\",\"id\":\"p1062\",\"attributes\":{\"filter\":{\"type\":\"object\",\"name\":\"AllIndices\",\"id\":\"p1063\"}}},\"glyph\":{\"type\":\"object\",\"name\":\"Line\",\"id\":\"p1058\",\"attributes\":{\"x\":{\"type\":\"field\",\"field\":\"x\"},\"y\":{\"type\":\"field\",\"field\":\"y\"},\"line_color\":\"#e6f598\",\"line_width\":2}},\"nonselection_glyph\":{\"type\":\"object\",\"name\":\"Line\",\"id\":\"p1059\",\"attributes\":{\"x\":{\"type\":\"field\",\"field\":\"x\"},\"y\":{\"type\":\"field\",\"field\":\"y\"},\"line_color\":\"#e6f598\",\"line_alpha\":0.1,\"line_width\":2}},\"muted_glyph\":{\"type\":\"object\",\"name\":\"Line\",\"id\":\"p1060\",\"attributes\":{\"x\":{\"type\":\"field\",\"field\":\"x\"},\"y\":{\"type\":\"field\",\"field\":\"y\"},\"line_color\":\"#e6f598\",\"line_alpha\":0.2,\"line_width\":2}}}},{\"type\":\"object\",\"name\":\"GlyphRenderer\",\"id\":\"p1071\",\"attributes\":{\"data_source\":{\"type\":\"object\",\"name\":\"ColumnDataSource\",\"id\":\"p1065\",\"attributes\":{\"selected\":{\"type\":\"object\",\"name\":\"Selection\",\"id\":\"p1066\",\"attributes\":{\"indices\":[],\"line_indices\":[]}},\"selection_policy\":{\"type\":\"object\",\"name\":\"UnionRenderers\",\"id\":\"p1067\"},\"data\":{\"type\":\"map\",\"entries\":[[\"x\",{\"type\":\"ndarray\",\"array\":[\"1960\",\"1961\",\"1962\",\"1963\",\"1964\",\"1965\",\"1966\",\"1967\",\"1968\",\"1969\",\"1970\",\"1971\",\"1972\",\"1973\",\"1974\",\"1975\",\"1976\",\"1977\",\"1978\",\"1979\",\"1980\",\"1981\",\"1982\",\"1983\",\"1984\",\"1985\",\"1986\",\"1987\",\"1988\",\"1989\",\"1990\",\"1991\",\"1992\",\"1993\",\"1994\",\"1995\",\"199
6\",\"1997\",\"1998\",\"1999\",\"2000\",\"2001\",\"2002\",\"2003\",\"2004\",\"2005\",\"2006\",\"2007\",\"2008\",\"2009\",\"2010\",\"2011\",\"2012\",\"2013\",\"2014\",\"2015\",\"2016\",\"2017\",\"2018\",\"2019\",\"2020\",\"2021\",\"2022\"],\"shape\":[63],\"dtype\":\"object\",\"order\":\"little\"}],[\"y\",{\"type\":\"ndarray\",\"array\":{\"type\":\"bytes\",\"data\":\"xSCwcmj5XUDJdr6fGk9eQOf7qfHSpV5APQrXo3D9XkCuR+F6FFZfQEjhehSur19Ay6FFtvPdX0CDwMqhRQZgQARWDi2yHWBAaJHtfD81YECwcmiR7UxgQClcj8L1ZGBADAIrhxZ9YEDFILByaJVgQOf7qfHSrWBAZmZmZmbGYEBiEFg5tORgQDMzMzMzA2FAUrgeheshYUC+nxov3UBhQLgehetRYGFAJzEIrBx+YUB1kxgEVpphQEw3iUFgtWFApHA9CtfXYUD6fmq8dPthQPhT46WbHGJAkxgEVg49YkCBlUOLbFtiQFCNl24Sd2JAbjDUYQV/YkDrcd9qnYxiQMU56ug4kWJABmUaTa6OYkCsdHedDY1iQGJodXIGjGJAhUTaxh+FYkAV4SajSn1iQMPVARB3dWJA5/7qcd9mYkDhXwSNGVNiQD/kLVc/P2JAT+rL0s4pYkAaM4l6wRRiQOxP4nMnAmJAn1bRH5rwYUB9dVWgluFhQKhxb37D2WFAMnVXdsHXYUBI3jmUIdlhQPTfg9cu22FA304iwr/eYUBTPZl/dOZhQIkHlE058GFAidNJtjr6YUDVPh2PGQNiQPSnjer0CmJAuvQvSeUPYkAtI/WeSg9iQH2UERcADWJA/aGZJ1cCYkDvVpboLARiQPvKg/SUB2JA\"},\"shape\":[63],\"dtype\":\"float64\",\"order\":\"little\"}]]}}},\"view\":{\"type\":\"object\",\"name\":\"CDSView\",\"id\":\"p1072\",\"attributes\":{\"filter\":{\"type\":\"object\",\"name\":\"AllIndices\",\"id\":\"p1073\"}}},\"glyph\":{\"type\":\"object\",\"name\":\"Line\",\"id\":\"p1068\",\"attributes\":{\"x\":{\"type\":\"field\",\"field\":\"x\"},\"y\":{\"type\":\"field\",\"field\":\"y\"},\"line_color\":\"#fee08b\",\"line_width\":2}},\"nonselection_glyph\":{\"type\":\"object\",\"name\":\"Line\",\"id\":\"p1069\",\"attributes\":{\"x\":{\"type\":\"field\",\"field\":\"x\"},\"y\":{\"type\":\"field\",\"field\":\"y\"},\"line_color\":\"#fee08b\",\"line_alpha\":0.1,\"line_width\":2}},\"muted_glyph\":{\"type\":\"object\",\"name\":\"Line\",\"id\":\"p1070\",\"attributes\":{\"x\":{\"type\":\"field\",\"field\":\"x\"},\"y\":{\"type\":\"field\",\"field\":\"y\"},\"line_color\":\"#fee08b\",\"line_alpha\":0.2,\"line_width\":2}}}},{\"type\":\"object\",\"name\":\
"GlyphRenderer\",\"id\":\"p1081\",\"attributes\":{\"data_source\":{\"type\":\"object\",\"name\":\"ColumnDataSource\",\"id\":\"p1075\",\"attributes\":{\"selected\":{\"type\":\"object\",\"name\":\"Selection\",\"id\":\"p1076\",\"attributes\":{\"indices\":[],\"line_indices\":[]}},\"selection_policy\":{\"type\":\"object\",\"name\":\"UnionRenderers\",\"id\":\"p1077\"},\"data\":{\"type\":\"map\",\"entries\":[[\"x\",{\"type\":\"ndarray\",\"array\":[\"1960\",\"1961\",\"1962\",\"1963\",\"1964\",\"1965\",\"1966\",\"1967\",\"1968\",\"1969\",\"1970\",\"1971\",\"1972\",\"1973\",\"1974\",\"1975\",\"1976\",\"1977\",\"1978\",\"1979\",\"1980\",\"1981\",\"1982\",\"1983\",\"1984\",\"1985\",\"1986\",\"1987\",\"1988\",\"1989\",\"1990\",\"1991\",\"1992\",\"1993\",\"1994\",\"1995\",\"1996\",\"1997\",\"1998\",\"1999\",\"2000\",\"2001\",\"2002\",\"2003\",\"2004\",\"2005\",\"2006\",\"2007\",\"2008\",\"2009\",\"2010\",\"2011\",\"2012\",\"2013\",\"2014\",\"2015\",\"2016\",\"2017\",\"2018\",\"2019\",\"2020\",\"2021\",\"2022\"],\"shape\":[63],\"dtype\":\"object\",\"order\":\"little\"}],[\"y\",{\"type\":\"ndarray\",\"array\":{\"type\":\"bytes\",\"data\":\"X38SnzuFMEDBkUCDTf0wQNKJBFPNgDFAkGYsms4KMkDTUKOQZJoyQOcBLPLrLzNAR1Sobi7KM0AqOLwgImk0QM0jfzDwDDVApmJjXke0NUCr61BNSV42QOjAcoQMCDdA7gbRWtGyN0B7Szlf7GE4QMxEEVK3EzlAIXcRpijHOUDQRNjw9Ho6QIB/SpUoMztAIZOMnIXxO0Cp2m6Cb7I8QPAXsyWrdj1AN1MhHok7PkDb39kevQU/QLtIoSx83T9Adcdim1RiQEBdiNUfYeBAQI+oUN1ccEFAsirCTUYPQkA7N23GabJCQOKt82+XVUNAsfm4NlTwQ0CZf/RNmnREQF6iemtg4URAUAEwnkFDRUBW9fI7TaJFQO+NIQA4/kVAvvc3aK9URkAm5e5zfKRGQK5hhsYT7UZAfTz03a0uR0BweawZGWhHQHi3skRnnUdATuyhfazUR0Brm+JxUQ1IQISgo1UtR0hAvRx23zGCSEBPzlDc8b5IQPTeGAKA/0hAmdcRh2xISUD76xUW3JVJQMh4lEp45ElA5j+k3744SkCe6/twkJJKQJShKqbS70pAzuFa7WFdS0AWaHdIMfBLQL75DRMNNkxAUHPyIhNSTEDayeAoeatMQAX6RJ4kC01Aw2M/i6VmTUCjWG5pNbJNQEax3NJq8k1A\"},\"shape\":[63],\"dtype\":\"float64\",\"order\":\"little\"}]]}}},\"view\":{\"type\":\"object\",\"name\":\"CDSView\",\"id\":\"p1082\",\"attributes\":{\"filter\":{\"type\":\"object\",\"
name\":\"AllIndices\",\"id\":\"p1083\"}}},\"glyph\":{\"type\":\"object\",\"name\":\"Line\",\"id\":\"p1078\",\"attributes\":{\"x\":{\"type\":\"field\",\"field\":\"x\"},\"y\":{\"type\":\"field\",\"field\":\"y\"},\"line_color\":\"#fc8d59\",\"line_width\":2}},\"nonselection_glyph\":{\"type\":\"object\",\"name\":\"Line\",\"id\":\"p1079\",\"attributes\":{\"x\":{\"type\":\"field\",\"field\":\"x\"},\"y\":{\"type\":\"field\",\"field\":\"y\"},\"line_color\":\"#fc8d59\",\"line_alpha\":0.1,\"line_width\":2}},\"muted_glyph\":{\"type\":\"object\",\"name\":\"Line\",\"id\":\"p1080\",\"attributes\":{\"x\":{\"type\":\"field\",\"field\":\"x\"},\"y\":{\"type\":\"field\",\"field\":\"y\"},\"line_color\":\"#fc8d59\",\"line_alpha\":0.2,\"line_width\":2}}}}],\"toolbar\":{\"type\":\"object\",\"name\":\"Toolbar\",\"id\":\"p1010\",\"attributes\":{\"tools\":[{\"type\":\"object\",\"name\":\"PanTool\",\"id\":\"p1023\"},{\"type\":\"object\",\"name\":\"WheelZoomTool\",\"id\":\"p1024\",\"attributes\":{\"renderers\":\"auto\"}},{\"type\":\"object\",\"name\":\"BoxZoomTool\",\"id\":\"p1025\",\"attributes\":{\"overlay\":{\"type\":\"object\",\"name\":\"BoxAnnotation\",\"id\":\"p1026\",\"attributes\":{\"syncable\":false,\"level\":\"overlay\",\"visible\":false,\"left\":{\"type\":\"number\",\"value\":\"nan\"},\"right\":{\"type\":\"number\",\"value\":\"nan\"},\"top\":{\"type\":\"number\",\"value\":\"nan\"},\"bottom\":{\"type\":\"number\",\"value\":\"nan\"},\"left_units\":\"canvas\",\"right_units\":\"canvas\",\"top_units\":\"canvas\",\"bottom_units\":\"canvas\",\"line_color\":\"black\",\"line_alpha\":1.0,\"line_width\":2,\"line_dash\":[4,4],\"fill_color\":\"lightgrey\",\"fill_alpha\":0.5}}}},{\"type\":\"object\",\"name\":\"SaveTool\",\"id\":\"p1031\"},{\"type\":\"object\",\"name\":\"ResetTool\",\"id\":\"p1032\"},{\"type\":\"object\",\"name\":\"HelpTool\",\"id\":\"p1033\"}]}},\"left\":[{\"type\":\"object\",\"name\":\"LinearAxis\",\"id\":\"p1018\",\"attributes\":{\"ticker\":{\"type\":\"object\",\"name\":\"BasicT
icker\",\"id\":\"p1019\",\"attributes\":{\"mantissas\":[1,2,5]}},\"formatter\":{\"type\":\"object\",\"name\":\"BasicTickFormatter\",\"id\":\"p1020\"},\"axis_label\":\"Population, total (in millions)\",\"major_label_policy\":{\"type\":\"object\",\"name\":\"AllLabels\",\"id\":\"p1021\"}}}],\"below\":[{\"type\":\"object\",\"name\":\"LinearAxis\",\"id\":\"p1013\",\"attributes\":{\"ticker\":{\"type\":\"object\",\"name\":\"BasicTicker\",\"id\":\"p1014\",\"attributes\":{\"mantissas\":[1,2,5]}},\"formatter\":{\"type\":\"object\",\"name\":\"BasicTickFormatter\",\"id\":\"p1015\"},\"axis_label\":\"Year\",\"major_label_policy\":{\"type\":\"object\",\"name\":\"AllLabels\",\"id\":\"p1016\"}}}],\"center\":[{\"type\":\"object\",\"name\":\"Grid\",\"id\":\"p1017\",\"attributes\":{\"axis\":{\"id\":\"p1013\"}}},{\"type\":\"object\",\"name\":\"Grid\",\"id\":\"p1022\",\"attributes\":{\"dimension\":1,\"axis\":{\"id\":\"p1018\"}}},{\"type\":\"object\",\"name\":\"Legend\",\"id\":\"p1043\",\"attributes\":{\"location\":\"right\",\"click_policy\":\"mute\",\"items\":[{\"type\":\"object\",\"name\":\"LegendItem\",\"id\":\"p1044\",\"attributes\":{\"label\":{\"type\":\"value\",\"value\":\"BRA\"},\"renderers\":[{\"id\":\"p1040\"}]}},{\"type\":\"object\",\"name\":\"LegendItem\",\"id\":\"p1054\",\"attributes\":{\"label\":{\"type\":\"value\",\"value\":\"CHN\"},\"renderers\":[{\"id\":\"p1051\"}]}},{\"type\":\"object\",\"name\":\"LegendItem\",\"id\":\"p1064\",\"attributes\":{\"label\":{\"type\":\"value\",\"value\":\"IND\"},\"renderers\":[{\"id\":\"p1061\"}]}},{\"type\":\"object\",\"name\":\"LegendItem\",\"id\":\"p1074\",\"attributes\":{\"label\":{\"type\":\"value\",\"value\":\"RUS\"},\"renderers\":[{\"id\":\"p1071\"}]}},{\"type\":\"object\",\"name\":\"LegendItem\",\"id\":\"p1084\",\"attributes\":{\"label\":{\"type\":\"value\",\"value\":\"ZAF\"},\"renderers\":[{\"id\":\"p1081\"}]}}]}}]}}]}};\n", - " const render_items = 
[{\"docid\":\"6c86f57c-0f70-4648-a27e-ed2b73c3308b\",\"roots\":{\"p1001\":\"d2127207-0be3-46fb-86fa-2b57704ab0fc\"},\"root_ids\":[\"p1001\"]}];\n", - " root.Bokeh.embed.embed_items_notebook(docs_json, render_items);\n", - " }\n", - " if (root.Bokeh !== undefined) {\n", - " embed_document(root);\n", - " } else {\n", - " let attempts = 0;\n", - " const timer = setInterval(function(root) {\n", - " if (root.Bokeh !== undefined) {\n", - " clearInterval(timer);\n", - " embed_document(root);\n", - " } else {\n", - " attempts++;\n", - " if (attempts > 100) {\n", - " clearInterval(timer);\n", - " console.log(\"Bokeh: ERROR: Unable to run BokehJS code because BokehJS library is missing\");\n", - " }\n", - " }\n", - " }, 10, root)\n", - " }\n", - "})(window);" - ], - "application/vnd.bokehjs_exec.v0+json": "" - }, - "metadata": { - "application/vnd.bokehjs_exec.v0+json": { - "id": "p1001" - } - }, - "output_type": "display_data" - } - ], - "source": [ - "output_notebook()\n", - "\n", - "p = figure(title=\"Population, total (World Bank)\", width=700, height=600)\n", - "\n", - "# colors\n", - "colors = itertools.cycle(Spectral6)\n", - "\n", - "# plotting the line graph\n", - "for column, color in zip(df.columns, colors):\n", - " p.line(\n", - " df.index,\n", - " df[column],\n", - " legend_label=column,\n", - " color=color,\n", - " line_width=2,\n", - " )\n", - "\n", - "p.legend.location = \"right\"\n", - "p.legend.click_policy = \"mute\"\n", - "p.title.text_font_size = \"12pt\"\n", - "\n", - "p.xaxis.axis_label = \"Year\"\n", - "p.yaxis.axis_label = \"Population, total (in millions)\"\n", - "\n", - "show(p)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - 
"version": "3.10.13" - }, - "vscode": { - "interpreter": { - "hash": "b6702b69e93007336b96338c5a331192f07cedff01d36d4dcfa0f842adb718ad" - } - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/notebooks/world-bank-package.ipynb b/notebooks/world-bank-package.ipynb deleted file mode 100644 index 6883570..0000000 --- a/notebooks/world-bank-package.ipynb +++ /dev/null @@ -1,281 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "90700fdc-fcc7-4e54-8c9e-449879d8c66d", - "metadata": { - "tags": [] - }, - "source": [ - "# Python Package Example\n", - "\n", - "> The following is an example of on how to use and distribute your project as a [Python package](https://packaging.python.org) using the example template. Remember mix and match to yout project's requirements. " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ef92b033-81e2-4c5f-b56a-63f4f7a37247", - "metadata": { - "editable": true, - "slideshow": { - "slide_type": "" - }, - "tags": [ - "remove-cell" - ] - }, - "outputs": [], - "source": [ - "import itertools\n", - "\n", - "from bokeh.palettes import Spectral6\n", - "from bokeh.plotting import figure, output_notebook, show" - ] - }, - { - "cell_type": "markdown", - "id": "14e89727", - "metadata": {}, - "source": [ - "## Usage" - ] - }, - { - "cell_type": "markdown", - "id": "b4c0f3e8-7756-41bb-aa21-cc2eee5ff67f", - "metadata": {}, - "source": [ - "Unlike the [previous example](https://worldbank.github.io/template/notebooks/world-bank-api.html), where the source code was contained on the Jupyter notebook itself, we (re)use a Python package - the [template](https://github.com/worldbank/template/tree/main/src/template) Python package - which will let us (re)use any attributes and methods in the following example.\n", - "\n", - "Let's start by importing `WorldBankIndicatorsAPI`, a Python API wrapper class created to facilitate the usage of the [World Bank Indicators 
API](https://datahelpdesk.worldbank.org/knowledgebase/articles/889392-about-the-indicators-api-documentation)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d797ef77-6ca4-4f9d-a1f8-abbfd9884b07", - "metadata": {}, - "outputs": [], - "source": [ - "from template.indicators import WorldBankIndicatorsAPI" - ] - }, - { - "cell_type": "markdown", - "id": "17f380e4-3854-4af6-940c-7afe9723a59a", - "metadata": {}, - "source": [ - "Let's continue by creating the API object. " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f911a5c3-6994-45a6-a049-4b398f5890c0", - "metadata": {}, - "outputs": [], - "source": [ - "api = WorldBankIndicatorsAPI()" - ] - }, - { - "cell_type": "markdown", - "id": "7fa96741-f4cd-4504-a5f8-6467f9a2345e", - "metadata": {}, - "source": [ - "The `api` wrapper object is now ready to use! We will invoke its `query` method to retrieve data from the [World Bank Indicators API](https://datahelpdesk.worldbank.org/knowledgebase/articles/889392-about-the-indicators-api-documentation). To learn how to use it, such as information about method signature, valid parameters and return value, we read `help`. Since [PEP 257](https://peps.python.org/pep-0257), Python offers *doctrings*, which are an easy and standard to create code documentation and it is a good practice adopt it. Documentating the source code is crucial to create a maintainable reliable and reproducicle code base and project.\n", - "\n", - "Let's see the `query` method's *docstring* as shown below." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "fb6ca314-d161-40e1-a376-a3013a0711eb", - "metadata": {}, - "outputs": [], - "source": [ - "help(api.query)" - ] - }, - { - "cell_type": "markdown", - "id": "e82fc342-165d-42d6-b3dc-7534c215ca1f", - "metadata": {}, - "source": [ - "The `query` method allows us to select an **indicator** (e.g, [World Development Indicators](https://datatopics.worldbank.org/world-development-indicators)), a list of countries and [query parameters](https://datahelpdesk.worldbank.org/knowledgebase/articles/898581#query-strings). Note that contrary to the [previous example](https://worldbank.github.io/template/notebooks/world-bank-api.html), the method expects a list of country names and converts them to [ISO 3166-1 alpha-3](https://www.iso.org/iso-3166-country-codes.html) automatically." - ] - }, - { - "cell_type": "markdown", - "id": "23b0a1eb-73c1-42e7-8903-98e362ef86de", - "metadata": {}, - "source": [ - "Let's invoke the `query` method and retrieve the results for `SP.POP.TOTL` for the [BRICS](https://infobrics.org) (as before)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7fb7daea-c5cf-42ea-b746-a565dd9ac4e1", - "metadata": {}, - "outputs": [], - "source": [ - "df = api.query(\n", - " \"SP.POP.TOTL\", country=[\"Brazil\", \"China\", \"India\", \"Russia\", \"South Africa\"]\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "46662c1b-4c19-424b-8a61-f651cb486c5b", - "metadata": {}, - "source": [ - "**Voilà!** We just (re)used the [template](https://github.com/worldbank/template/tree/main/src/template) Python package in our example delegating the maintenance and logic, making the notebook easier to understand and reproduce. 
\n", - "\n", - "```{tip}\n", - "In addition, the `template` makes any Python package automatically [pip installable](https://packaging.python.org/en/latest/tutorials/installing-packages/) and accessible to *anyone* and from *anywhere*!\n", - "\n", - "To install from source:\n", - "\n", - "\tpip install git+https://github.com/worldbank/template.git\n", - "\n", - "To install from version:\n", - "\n", - "\tpip install git+https://github.com/worldbank/template.git@v0.1.0\n", - "\t\n", - "\n", - "When distributing a project release, it is strongly recommended to adhere to release management good practices. It is recommended to create checklists, adopt versioning (e.g, [semantic versioning](https://semver.org/) and to release on [Python Package Index](https://pypi.org/) (instead of GitHub).\n", - "```" - ] - }, - { - "cell_type": "markdown", - "id": "80887da5-0474-48b3-8c71-fbe3dbd3a8e8", - "metadata": {}, - "source": [ - "```{tip}\n", - "The template will automatically find and install any local `src` packages as long as the `setup.cfg` file is up-to-date.\n", - "```" - ] - }, - { - "cell_type": "markdown", - "id": "daa4319a-8936-4195-b1fc-aad9c008325b", - "metadata": {}, - "source": [ - "```{caution}\n", - "The `template` Python package should be used for demonstration purposes only. For support, please see the [World Bank Indicators API Documentation](https://datahelpdesk.worldbank.org/knowledgebase/articles/889392-about-the-indicators-api-documentation).\n", - "```" - ] - }, - { - "cell_type": "markdown", - "id": "e9f14239", - "metadata": {}, - "source": [ - "Finally, let's take a look at the retrieved data." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b1d7cf70-bf0e-4c12-ae0d-fd26349291db", - "metadata": { - "tags": [ - "remove-cell" - ] - }, - "outputs": [], - "source": [ - "df = df.pivot_table(values=\"value\", index=\"date\", columns=\"country.value\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "699a0495-4f06-479c-b517-58336110547f", - "metadata": { - "tags": [ - "output_scroll" - ] - }, - "outputs": [], - "source": [ - "df" - ] - }, - { - "cell_type": "markdown", - "id": "c5daa85a-004d-4e93-be84-72d064d0b83b", - "metadata": {}, - "source": [ - "## Visualization\n", - "\n", - "As before, let's now plot the data as a time series using [Bokeh](https://docs.bokeh.org)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "60219760", - "metadata": {}, - "outputs": [], - "source": [ - "output_notebook()\n", - "\n", - "# instantiating the figure object\n", - "p = figure(title=\"Population, total (World Bank)\", width=700, height=600)\n", - "\n", - "# colors\n", - "colors = itertools.cycle(Spectral6)\n", - "\n", - "# plotting the line graph\n", - "for column, color in zip(df.columns, colors):\n", - " p.line(\n", - " df.index,\n", - " df[column],\n", - " legend_label=column,\n", - " color=color,\n", - " line_width=2,\n", - " )\n", - "\n", - "p.legend.location = \"right\"\n", - "p.legend.click_policy = \"mute\"\n", - "p.title.text_font_size = \"12pt\"\n", - "\n", - "p.xaxis.axis_label = \"Year\"\n", - "p.yaxis.axis_label = \"Population, total (in millions)\"\n", - "\n", - "show(p)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.13" - }, - "vscode": { - "interpreter": { 
- "hash": "b6702b69e93007336b96338c5a331192f07cedff01d36d4dcfa0f842adb718ad" - } - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/src/template/__init__.py b/src/template/__init__.py deleted file mode 100644 index a535a10..0000000 --- a/src/template/__init__.py +++ /dev/null @@ -1,7 +0,0 @@ -from importlib.metadata import version, PackageNotFoundError - -try: - __version__ = version("datalab") -except PackageNotFoundError: - # package is not installed - pass diff --git a/src/template/indicators.py b/src/template/indicators.py deleted file mode 100644 index b98824f..0000000 --- a/src/template/indicators.py +++ /dev/null @@ -1,83 +0,0 @@ -import pandas -import pycountry -import requests - - -class WorldBankIndicatorsAPI: - URL = "https://api.worldbank.org/v2/country" - - def _get_country_code(self, country): - """ - Using `pycountry`, return the ISO 3166-1 alpha-3 country code for corresponding query term. - - See also: - https://github.com/flyingcircusio/pycountry - - Parameters - ---------- - country : str - - Returns - ------- - str - ISO 3166-1 alpha-3 country code for corresponding query term. - - Raises - ------ - LookupError - If the query term is not a valid country. - """ - return pycountry.countries.search_fuzzy(country)[0].alpha_3 - - def _get(self, indicator, country: str = "all", params: dict = {}): - """ - Retrieve a response, valid JSON response or error, from the World Bank Indicators API. - - See also: - https://datahelpdesk.worldbank.org/knowledgebase/articles/889392-about-the-indicators-api-documentation - - Parameters - ---------- - indicator : str - country : str, optional - params : dict, optional - - Returns - ------- - requests.models.Response - Return JSON response from the World Bank Indicators API. 
- """ - url = f"{self.URL}/{country}/indicator/{indicator}" - - return requests.get(url, params) - - def query(self, indicator, country: list = "all", params: dict = {}): - """ - Retrieve a response, valid JSON response or error, from the World Bank Indicators API. - - See also: - https://datahelpdesk.worldbank.org/knowledgebase/articles/889392-about-the-indicators-api-documentation - - Parameters - ---------- - indicator : str - World Bank API Indicator. - country : list, optional - List of countries. The country name is converted to ISO 3166-1 alpha-3 country code. - params : dict, optional - World Bank API Indicator Query Strings. - - Returns - ------- - pandas.core.frame.DataFrame - Return a Pandas DataFrame obtained with response data from World Bank Indicators API. - """ - if isinstance(country, list): - country = ";".join([self._get_country_code(c) for c in country]) - - params.update({"format": "json", "per_page": 1000}) - - response = self._get(indicator, country, params) - data = response.json()[-1] - - return pandas.json_normalize(data) diff --git a/src/tunisia/streamlit_app.py b/src/tunisia/streamlit_app.py new file mode 100644 index 0000000..4f47b99 --- /dev/null +++ b/src/tunisia/streamlit_app.py @@ -0,0 +1,81 @@ +import streamlit as st +from langchain.llms import OpenAI +import os +from pathlib import Path +from langchain_community.llms import HuggingFaceEndpoint, HuggingFaceHub +from langchain.prompts import PromptTemplate, ChatPromptTemplate +from langchain.chains import LLMChain, ConversationChain, RetrievalQA, RetrievalQAWithSourcesChain +from langchain_openai import ChatOpenAI, OpenAI +from langchain.memory import ChatMessageHistory, ConversationBufferMemory, ConversationSummaryMemory + +# Document Loaders and Text Splitter +from langchain_community.document_loaders import PyPDFLoader, CSVLoader, HNLoader, UnstructuredHTMLLoader +from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter + +# Vector Embeddings 
+from langchain_openai import OpenAIEmbeddings
+from langchain_community.vectorstores import Chroma
+
+from langchain_core.runnables import RunnablePassthrough
+from langchain.schema.output_parser import StrOutputParser
+
+# App Logic
+st.title('🦜🔗 RAG Based Chatbot')
+
+OPENAI_API_KEY = st.sidebar.text_input('OpenAI API Key', type='password')
+DIR_WD = Path("/Users/dunstanmatekenya/Google Drive/My Drive/GenAI-Course/Mod2-LLM-Overview/")
+DIR_DATA = DIR_WD.joinpath("data")
+DIR_DOCS = Path("/Users/dunstanmatekenya/Google Drive/My Drive/GenAI-Course/Public Health Documents")
+FILE_TU_COVID_RESPONSE = DIR_DOCS.joinpath("who_wou_apr_2024.pdf")
+
+def load_pdf_docs(pdf_file):
+    """Load the PDF at ``pdf_file`` and return it split into overlapping chunks."""
+    loader = PyPDFLoader(pdf_file)
+    data = loader.load()
+
+    # Splitter settings: chunk size and the overlap kept between consecutive chunks.
+    chunk_size = 400
+    chunk_overlap = 100
+
+    rc_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
+    docs = rc_splitter.split_documents(data)
+    return docs
+
+def add_documents2vectordb(embedding_model, pdf_file, vectordb_dir):
+    """Chunk ``pdf_file``, embed the chunks and return a Chroma store built from them.
+
+    ``embedding_model`` is the caller's embeddings object; ``vectordb_dir`` is
+    passed to Chroma as its persist directory.
+    """
+    docs = load_pdf_docs(pdf_file)
+
+    # Use the embeddings passed in by the caller instead of building a second
+    # client here (the key belongs in ``openai_api_key``, not ``openai_api_type``).
+    vectordb = Chroma(persist_directory=vectordb_dir, embedding_function=embedding_model)
+    vectordb.persist()
+    # NOTE(review): the returned store is built separately from ``vectordb`` - confirm intended.
+    docstorage = Chroma.from_documents(docs, embedding_model)
+    return docstorage
+
+def generate_response(input_text):
+    """Run ``input_text`` through a retrieval QA chain over the PDF and display the result."""
+    # The PDF is re-loaded and re-embedded on every call.
+    embedding_function = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)
+    docstorage = add_documents2vectordb(embedding_model=embedding_function,
+                                        pdf_file=str(FILE_TU_COVID_RESPONSE),
+                                        vectordb_dir=str(DIR_DATA))
+    print('Done with Preparing Documents')
+    llm = OpenAI(model_name="gpt-3.5-turbo-instruct", openai_api_key=OPENAI_API_KEY, temperature=0.7)
+    qa = RetrievalQAWithSourcesChain.from_chain_type(llm=llm, chain_type="stuff",
+                                                     retriever=docstorage.as_retriever())
+    results = qa({"question": "{}".format(input_text)}, return_only_outputs=True)
+    print(results)
+    st.info(results)
+
+with st.form('my_form'):
+    text = st.text_area('Enter text:', 'What is the situation of drought in the Amazon forest?')
+    submitted = st.form_submit_button('Submit')
+    if not OPENAI_API_KEY.startswith('sk-'):
+        st.warning('Please enter your OpenAI API key!', icon='⚠')
+    if submitted and OPENAI_API_KEY.startswith('sk-'):
+        generate_response(text)
\ No newline at end of file