diff --git a/.all-contributorsrc b/.all-contributorsrc new file mode 100644 index 0000000..cfac036 --- /dev/null +++ b/.all-contributorsrc @@ -0,0 +1,4 @@ +{ + "projectName": "TemporalScope", + "projectOwner": "philip-ndikum" +} diff --git a/.gitignore b/.gitignore index a49407f..eb153ed 100644 --- a/.gitignore +++ b/.gitignore @@ -30,10 +30,8 @@ share/python-wheels/ *.egg MANIFEST -# Sphinx documentation -docs/_build/* -docs/.doctrees/ -docs/.buildinfo +# Mkdocs documentation +/site/ # Jupyter / IPython .ipynb_checkpoints/ diff --git a/.licenserc.yaml b/.licenserc.yaml index 0d1d757..525b209 100644 --- a/.licenserc.yaml +++ b/.licenserc.yaml @@ -22,6 +22,8 @@ header: - '**/*.sh' - '.gitkeep' - '.env.example' + - '.all-contributorsrc' + - 'scripts/gen_ref_docs.py' comment: on-failure diff --git a/.readthedocs.yaml b/.readthedocs.yaml index efec001..83615df 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -1,4 +1,4 @@ -# Read the Docs configuration file for Sphinx projects +# Read the Docs configuration file for Mkdocs projects # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details version: 2 @@ -15,6 +15,5 @@ python: extra_requirements: - docs -sphinx: - configuration: docs/conf.py - fail_on_warning: true +mkdocs: + configuration: mkdocs.yml diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index b1e3ad6..3ff63db 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,4 +1,5 @@ ## Table of Contents + - [Contributing to TemporalScope](#contributing-to-temporalscope) - [Contribution Guidelines](#contribution-guidelines) - [How to Contribute to TemporalScope](#how-to-contribute-to-temporalscope) @@ -19,117 +20,111 @@ - [Workflow for Releasing New Versions](#workflow-for-releasing-new-versions) - [Code Style](#code-style) - [Reporting Issues \& Requesting Features](#reporting-issues--requesting-features) + -# Contributing to TemporalScope +# Contributing to TemporalScope ⌨️ -Thank you for your interest in contributing to TemporalScope! Contributions of all kinds are welcome and appreciated. +Thank you for your interest in contributing to TemporalScope! We welcome and appreciate contributions of all types. This guide is designed to help you get started with the contribution process. +--- ## Contribution Guidelines -> [!IMPORTANT] -> By contributing to this project, you are agreeing to the following conditions: - -1. **Adherence to the Apache License 2.0**: - - All contributions must comply with the [Apache License 2.0](LICENSE). - - You must ensure that all work contributed is your own original creation, fully independent, or obtained through publicly available academic literature or properly licensed third-party sources. -2. **Conflict of Interest and Independence**: - - By contributing, you affirm that your contributions do not conflict with any other proprietary agreements, employment contracts, or other legal obligations you may have. - - You agree that all work contributed does not infringe on any third-party rights or violate any agreements you are bound to (such as non-compete or non-disclosure agreements). -3. **Academic Integrity and Citation**: - - If your contributions are based on academic literature or third-party software, you must properly cite and reference these sources in compliance with the principles of academic integrity and the terms of the Apache License. -4. 
**Workflow Compliance**: - - All contributors must comply with the project's defined workflow protocols, including adhering to test, security, and deployment workflows, as outlined in the [OpenSSF Best Practices](https://openssf.org/). -5. **Liability and Responsibility**: - - Contributors assume full responsibility for the originality and legality of their contributions and will hold harmless the project maintainers from any claims arising from legal conflicts, breaches, or intellectual property issues related to their contributions. - -We are excited to have your contributions but ask that you follow these guidelines to ensure the project remains legally sound and academically rigorous. +> WARNING: **Important** +> By contributing to this project, you are agreeing to the following conditions -# How to Contribute to TemporalScope +- **Adherence to the Apache License 2.0:** + - All contributions must comply with the Apache License 2.0. + - You must ensure that all work contributed is your own original creation, fully independent, or obtained through publicly available academic literature or properly licensed third-party sources. +- **Conflict of Interest and Independence**: + - By contributing, you affirm that your contributions do not conflict with any other proprietary agreements, employment contracts, or other legal obligations you may have. + - You agree that all work contributed does not infringe on any third-party rights or violate any agreements you are bound to (such as non-compete or non-disclosure agreements). +- **Academic Integrity and Citation:** + - If your contributions are based on academic literature or third-party software, you must properly cite and reference these sources in compliance with the principles of academic integrity and the terms of the Apache License. +- **Workflow Compliance:** + - All contributors must comply with the project's defined workflow protocols, including adhering to test, security, and deployment workflows, as outlined in the [OpenSSF Best Practices](https://openssf.org/). +- **Liability and Responsibility:** + - Contributors assume full responsibility for the originality and legality of their contributions and will hold harmless the project maintainers from any claims arising from legal conflicts, breaches, or intellectual property issues related to their contributions. -We welcome contributions to TemporalScope! This guide will help you get started with the contribution process. +Thank you for your understanding and cooperation. We look forward to your contributions! -# Issue Tracking +--- +## Issue Tracking We use [GitHub Issues](https://github.com/philip-ndikum/TemporalScope/issues) to track bugs, enhancements, features, and refactoring suggestions. To propose something new: -1. Open an Issue -2. Describe the issue: +**Open an Issue:** - - Expected behavior - - Actual behavior - - Context and reproduction steps +- Describe the issue (expected behavior, actual behavior, context and reproduction steps) +- Provide a rough implementation or pseudo code if necessary +- Include any relevant information you've collected -3. Provide a rough implementation or pseudo code if necessary -4. Include any relevant information you've collected +**After submission:** -After submission: +- The project team will label the issue +- A team member will attempt to reproduce the issue (If reproduction steps are unclear, you may be asked for more details) +- Reproducible issues will be scheduled for a fix, or left open for community implementation -1. 
The project team will label the issue -2. A team member will attempt to reproduce the issue - - If reproduction steps are unclear, you may be asked for more details -3. Reproducible issues will be: - - Scheduled for a fix, or left open for community implementation - -Guidelines for effective issue reports: +**Guidelines for effective issue reports:** - Be as specific as possible - Include code samples when relevant - Describe your environment (OS, python version, etc.) if applicable - Use clear, concise language -# Contributing Code +--- +## Contributing Code -## Fork the Repository +### Fork the Repository 1. Create your own [fork](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/working-with-forks/fork-a-repo) of the TemporalScope repository. 2. Clone a local copy of your fork: - ```console - $ git clone git@github.com:YOUR-USERNAME/TemporalScope.git - ``` + ```console + git clone git@github.com:YOUR-USERNAME/TemporalScope.git + ``` - or + or - ```console - $ git clone https://github.com/YOUR-USERNAME/TemporalScope.git - ``` + ```console + git clone https://github.com/YOUR-USERNAME/TemporalScope.git + ``` -## Setup Development Environment +### Setup Development Environment TemporalScope uses [Hatch](https://hatch.pypa.io/latest/), a Python project manager, for managing virtual environments, building the project, and publishing packages. 1. Install Hatch by following the [installation instructions](https://hatch.pypa.io/latest/install/) for your operating system. 2. Verify the installation (your version number may differ): - ```console - $ cd TemporalScope - $ hatch version - 0.1.0 - ``` + ```console + $ cd TemporalScope + $ hatch version + 0.1.0 + ``` -## Install Pre-commit Hooks +### Install Pre-commit Hooks We use [pre-commit](https://pre-commit.com/) hooks to ensure code quality and consistency. Set up the git hook scripts: ```console -$ pre-commit install --hook-type commit-msg --hook-type pre-push +pre-commit install --hook-type commit-msg --hook-type pre-push ``` -## Create a New Branch +### Create a New Branch Create a new branch with a descriptive name for your changes: ```console -$ git switch -c +git switch -c ``` -## Make Your Changes +### Make Your Changes 1. Make the necessary code changes in your local repository. 2. Write or update tests as needed. 3. Update documentation if you're introducing new features or changing existing functionality. -## Ensure Code Quality +### Ensure Code Quality TemporalScope employs various tools to maintain consistent code style, quality, and static type checking. While the CI pipeline tests code quality, running these checks locally expedites the review cycle. @@ -137,61 +132,55 @@ Before submitting your changes, perform the following steps: 1. Run the test suite: -```console -$ hatch run test:unit -``` -```console -$ hatch run test:integration -``` + ```console + hatch run test:unit + ``` + + ```console + hatch run test:integration + ``` 2. Check your code format: -```console -$ hatch run format-check -``` + ```console + hatch run format-check + ``` 3. Format your code (if needed): -```console -$ hatch run format -``` + ```console + hatch run format + ``` 4. Check your code style according to linting rules: -```console -$ hatch run check -``` + ```console + hatch run check + ``` 5. Automatically fix some errors (when possible): -```console -$ hatch run fix -``` + ```console + hatch run fix + ``` -6. 
All checks (managed by pre-commit) -```console -$ hatch run quality-assurance -``` -> [!NOTE] +> NOTE: > Running these checks locally will help identify and resolve issues before submitting your changes, streamlining the review process. -Here's the revised version of that section: - -## Commit Your Changes +### Commit Your Changes 1. Stage your changes: - ```console - $ git add [args] - ``` - + ```console + git add [args] + ``` 2. Commit your changes with a descriptive commit message. Follow the [Conventional Commits](https://www.conventionalcommits.org/en/v1.0.0/) specification: - ```console - $ git commit -m "[optional scope]: " - ``` + ```console + git commit -m "[optional scope]: " + ``` Example commit messages: @@ -199,25 +188,24 @@ Here's the revised version of that section: - `fix(api): resolve data parsing error` - `docs: update README with new configuration options` - Alternatively, [Commitizen](https://commitizen-tools.github.io/commitizen/) is a handy tool for crafting commit messages that follow the Conventional Commit format. Simply run the command: - ```bash - $ cz commit - ``` +```console +cz commit +``` and follow the interactive prompts. Commitizen will generate a commit message that complies with the required standards. -> [!NOTE] +> NOTE: > If you've set up pre-commit hooks as recommended, they will automatically run various checks before finalizing your commit. This helps ensure code quality and consistency. -## Submit a Pull Request +### Submit a Pull Request 1. Push your changes to your fork: - ```console - $ git push origin - ``` + ```console + git push origin + ``` 2. Go to the [TemporalScope repository](https://github.com/philip-ndikum/TemporalScope) on GitHub. 3. Click on "Pull requests" and then "New pull request". @@ -225,13 +213,15 @@ Here's the revised version of that section: 5. Fill out the pull request template with details about your changes. 6. Submit the pull request. -> [!TIP] +> SUCCESS: **Tip** > To ease the review process, please follow the instructions: > > - For the title, use the [conventional commit convention](https://www.conventionalcommits.org/en/v1.0.0/). -> - For the body, follow the existing [pull request template](<(https://github.com/philip-ndikum/TemporalScope/blob/main/.github/pull_request_template.md)>). Describe and document your changes. +> - For the body, follow the existing pull request template. Describe and document your changes. +> - Ensure test, formatting, and linting pass locally before submitting your pull request. +> - Include any relevant information that will help reviewers understand your changes. -## After Submitting +### After Submitting - Respond to any feedback or questions from reviewers. - Make additional changes if requested. @@ -239,49 +229,64 @@ Here's the revised version of that section: Thank you for contributing to TemporalScope! +--- ## Documentation -TemporalScope utilizes [Sphinx](https://www.sphinx-doc.org/en/master/) for its documentation. + +TemporalScope utilizes [Mkdocs](https://www.mkdocs.org/) for its documentation. To build the docs locally run the following command + ```console -$ cd docs -$ hatch run docs:build +hatch run docs:build ``` + To view the docs locally run the following command + ```console -$ cd docs -$ hatch run docs:serve +hatch run docs:serve ``` -API documentation is automatically generated using the [sphinx-autoapi](https://sphinx-autoapi.readthedocs.io/en/latest/) extension. 
It extracts and builds the documentation directly from the code’s docstrings, which are written in reStructuredText (reST) format. +API documentation is automatically generated using the [mkdocstrings](https://mkdocstrings.github.io/) extension. It extracts and builds the documentation directly from the code’s docstrings, which are written in Numpy format. -## Test Policy - -TemporalScope prioritizes code quality, security, and stability. To uphold these standards: +- **Content**: The documentation is written in Markdown and stored in the `docs` directory. +- **API Documentation**: API references are automatically generated using the [mkdocstrings](https://mkdocstrings.github.io/) extension, which extracts information directly from the code's docstrings. +- **Reference Pages**: The script `scripts/gen_ref_pages.py` dynamically generates API reference pages during each build, enabling fully automated and hands-off API documentation. -1. New functionality should include corresponding tests. -2. [PyTest](https://docs.pytest.org/en/stable/) is our primary testing framework. -3. Required test types: +Documentation is hosted on [Read the Docs](https://readthedocs.org/projects/temporalscope/) and is automatically updated with each commit to the main branch. +PRs also provide a preview of the documentation changes. - - Unit tests for all new features - - Integration tests where applicable +--- -4. Test coverage should include: +## Testing - - Main functionality - - Edge cases - - Boundary conditions - -5. Before pushing code: - - - Ensure all test pass locally - -6. Test guidelines: - - Keep tests simple and easy to understand - - Follow PyTest [best practices](https://emimartin.me/pytest_best_practices) - -This policy helps ensure code integrates well with the existing codebase and prevents future bugs or regressions. +TemporalScope prioritizes code quality, security, and stability. To uphold these standards: +- New functionality should include corresponding tests. +- [PyTest](https://docs.pytest.org/en/stable/) is our primary testing framework. +- Required test types: + - Unit tests for all new features + - Integration tests where applicable +- Test coverage should include: + - Main functionality + - Edge cases + - Boundary conditions +- Before pushing code: + - Ensure all test pass locally + - Run formatting and linting checks +- Test guidelines: + - Keep tests simple and easy to understand + - Follow PyTest [best practices](https://emimartin.me/pytest_best_practices) + +> TIP: +> If you are unfamiliar with PyTest, the [official documentation](https://docs.pytest.org/en/stable/) provides a comprehensive guide to writing and running tests. +> Additionally, the [PyTest Quick Start Guide](https://docs.pytest.org/en/stable/getting-started.html) offers a quick introduction to the framework. +--- +## Coverage + +Coverage reports are generated with [pytest-cov](https://github.com/pytest-dev/pytest-cov) and are available for viewing on [Coveralls](https://coveralls.io/github/philip-ndikum/TemporalScope). +Coveralls provides comments on pull requests, indicating changes in coverage and highlighting areas that need additional testing. + +--- ## Development Roadmap & Changelog **TemporalScope** follows **[Semantic Versioning (SemVer)](https://semver.org/)**, a versioning system that conveys the scope of changes introduced in each new release. 
Each version is represented in the form **MAJOR.MINOR.PATCH**, where major releases introduce significant or breaking changes, minor releases add backward-compatible functionality, and patch releases are used for bug fixes or minor improvements. Below is the planned roadmap outlining feature development and milestones over the next 12–18 months. This roadmap is subject to change based on user feedback, emerging research, and community contributions. @@ -294,40 +299,48 @@ This policy helps ensure code integrates well with the existing codebase and pre | **0.5.0** | Planned | Focused on achieving a stable release. This version will include extensive user testing, bug fixes, and performance optimizations after several months of use in diverse environments. | | **1.0.0** | Stable | The first fully stable release, with robust documentation, thorough testing, and any feedback-driven refinements. This version will be ready for broader production use and long-term support. | +--- ## Workflow for Releasing New Versions In order to maintain consistency and clarity across different distribution platforms like **PyPI**, **Conda**, and **GitHub**, we follow a structured workflow for releasing new versions: -1. **Update the `CHANGELOG.md` File**: - - Ensure that all the changes (new features, bug fixes, deprecations, and breaking changes) are accurately recorded in the `CHANGELOG.md`. - - Each release should include a brief summary of changes, structured by categories like **Features**, **Fixes**, and **Breaking Changes**. -2. **Generate Release Notes**: - - Use the information from the `CHANGELOG.md` to create consistent **release notes**. - - Ensure that the release notes are in a uniform format for each platform: - - **PyPI**: Include a summary of the changes in the release description. - - **Conda**: Similar release notes can be included when publishing to **Conda**. - - **GitHub**: Publish the release notes in the **GitHub Releases** section. -3. **Distribute to Each Platform**: - - **PyPI**: Push the package using `hatch` after running the necessary build commands. - - **Conda**: Ensure the package is properly built for **Conda** and distributed to the Conda package manager. - - **GitHub**: Create a **GitHub Release**, attaching the release notes, and tagging the release in the repository. -4. **Verify the Release**: - - Ensure all distribution platforms (PyPI, Conda, GitHub) reflect the new release. - - Test the installation via `pip install temporalscope` and `conda install temporalscope` to ensure everything works as expected. - -By following this workflow, we ensure a consistent and smooth release process across all distribution channels, providing users with clear updates and robust software. We use HTTPS and SSH to protect against **man-in-the-middle (MITM) attacks** during the delivery process. However, **digital signatures** are not currently implemented, as the current security measures are sufficient for the project's scope. We will revisit this as the project scales and requires additional security layers. - +1. Update the `CHANGELOG.md` File: + - Ensure that all the changes (new features, bug fixes, deprecations, and breaking changes) are accurately recorded in the `CHANGELOG.md`. + - Each release should include a brief summary of changes, structured by categories like **Features**, **Fixes**, and **Breaking Changes**. +2. Generate Release Notes: + - Use the information from the `CHANGELOG.md` to create consistent release notes. 
+ - Ensure that the release notes are in a uniform format for each platform: + - PyPI: Include a summary of the changes in the release description. + - Conda: Similar release notes can be included when publishing to Conda. + - GitHub: Publish the release notes in the GitHub Releases section. +3. Distribute to Each Platform: + - PyPI: Push the package using hatch after running the necessary build commands. + - Conda: Ensure the package is properly built for Conda and distributed to the Conda package manager. + - GitHub: Create a GitHub Release, attaching the release notes, and tagging the release in the repository. +4. Verify the Release: + - Ensure all distribution platforms (PyPI, Conda, GitHub) reflect the new release. + - Test the installation via `pip install temporalscope` and `conda install temporalscope` to ensure everything works as expected. + +--- ## Code Style We strictly enforce code quality and style to ensure the stability and maintainability of the project. -- **[Ruff](https://docs.astral.sh/ruff)** formatting and linting -- **[Mypy](https://mypy.readthedocs.io/en/stable/)** type checking +- **[Ruff](https://docs.astral.sh/ruff)** formatting and linting. +- **[Mypy](https://mypy.readthedocs.io/en/stable/)** type checking. - **[PEP 8](https://peps.python.org/pep-0008/)** guidelines are followed for Python code style. -- **[Sphinx-style docstrings](https://sphinx-rtd-tutorial.readthedocs.io/en/latest/docstrings.html)** with type hints are required to conform to **MyPy** standards, enabling early error detection. +- **[Numpy-style docstrings](https://numpydoc.readthedocs.io/en/latest/format.html)** with type hints are required to conform to the project's documentation standards. - Write clear and concise commit messages. Adhere to [conventional commit convention](https://www.conventionalcommits.org/en/v1.0.0/). - Include comments and docstrings where necessary to improve code readability. +Once Hatch and pre-commit are installed, checks run automatically before each commit, ensuring your code meets project standards. The CI pipeline also verifies these checks before merging. + +> TIP: +Most IDEs and text editors have plugins to help adhere these standards. + +--- ## Reporting Issues & Requesting Features -If you encounter any bugs or issues, please read our `SECURITY.md` for instructions on managing security issues. Alternatively, utilize the Github Discussions to raise issues or potential long term features. +If you encounter any bugs or issues, please read our `SECURITY.md` for instructions on managing security issues. Alternatively, utilize Github issues to report a bug or potential long term feature request. + + diff --git a/README.md b/README.md index e4b0641..9148eb2 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,8 @@

[README banner block: HTML markup lost in extraction. The banner image line (alt text "TemporalScope Logo") is modified, a blank line is added, and the tagline "Scientifically driven Model-Agnostic Temporal Feature Importance Analysis" is retained.]
@@ -27,59 +28,89 @@

---
[Badge table: HTML markup lost in extraction. The previous six-column badge table (Compatibility, License, Meta, Build Tools, CI/CD, Security) is removed and replaced by a row-per-category table: Compatibility (Python Version, Linux Compatible), Docs (Documentation Status), Meta (Ruff, Checked with mypy, License, All Contributors), Build Tools (Hatch project), CI/CD (pre-commit.ci status, GitHub Actions, Coverage Status), and Security (OpenSSF Best Practices, Security: Bandit).]
+ ---- -**TemporalScope** is an open-source Python package designed to bridge the gap between scientific research and practical industry applications for analyzing the temporal dynamics of feature importance in AI & ML time series models. Developed in alignment with Linux Foundation standards and licensed under Apache 2.0, it builds on tools such as Boruta-SHAP and SHAP, using modern window partitioning algorithms to tackle challenges like non-stationarity and concept drift. This package is flexible and extensible, supporting frameworks like **Pandas, Polars, Modin, Dask, and PyArrow** via **native Narwhals compatibility**. Additionally, the optional _Clara LLM_ modules (etymology from the word _Clarity_) are intended to serve as a model-validation tool to support explainability efforts (XAI). **Note**: TemporalScope is currently in **beta and pre-release** phase, so some installation methods may not work as expected on all platforms. Please check the `CONTRIBUTIONS.md` for the full roadmap. +**TemporalScope** is an open-source Python package designed to bridge the gap between scientific research and practical industry applications for analyzing the temporal dynamics of feature importance in AI & ML time series models. Developed in alignment with Linux Foundation standards and licensed under Apache 2.0, it builds on tools such as Boruta-SHAP and SHAP, using modern window partitioning algorithms to tackle challenges like non-stationarity and concept drift. - +**TemporalScope** is an open-source Python package designed to bridge the gap between scientific research and practical industry applications for analyzing the temporal dynamics of feature importance in AI & ML time series models. Developed in alignment with Linux Foundation standards and licensed under Apache 2.0, it builds on tools such as Boruta-SHAP and SHAP, using modern window partitioning algorithms to tackle challenges like non-stationarity and concept drift. -### **Table of Contents** +This package is flexible and extensible, supporting frameworks like **Pandas, Polars, Modin, Dask, and PyArrow** via **native Narwhals compatibility**. Additionally, the optional _Clara LLM_ modules (etymology from the word _Clarity_) are intended to serve as a model-validation tool to support explainability efforts (XAI). +> NOTE: TemporalScope is currently in **beta and pre-release** phase, so some installation methods may not work as expected on all platforms. + + +### Table of Contents - [**Installation**](#installation) - [**Usage**](#usage) @@ -90,57 +121,62 @@ - [Cite this Project](#cite-this-project) - [**License, Limitations, and Legal Notice**](#license-limitations-and-legal-notice) -## **Installation** -**Note**: TemporalScope is currently in **beta**, so some installation methods may not work as expected on all platforms. +## Installation + + +> WARNING: TemporalScope is currently in **beta**, so some installation methods may not work as expected on all platforms. -1. **Basic Installation using pip**: You can install the core package using pip: +TemporalScope can be installed using the following methods: + +1. **via pip (recommended)**: You can install the core package using pip: ```console $ pip install temporalscope ``` -2. **Installation with conda**: For conda users, install via conda-forge: +2. **with conda**: For conda users, install via conda-forge: ```console $ conda install -c conda-forge temporalscope ``` -3. 
**System-level Dependencies**: To view generated documentation locally, you may need `xdg-open`: - ```console - $ sudo apt install xdg-utils - ``` -4. **Git Clone and Setup**: For security reasons, we minimize system-level dependencies. If you prefer the latest development version, follow these steps to clone the repository and set up the project using Hatch: +3. **git clone and setup (advanced)**: If you prefer the latest development version, follow these steps to clone the repository and set up the project using Hatch: - ```console - $ git clone https://github.com/philip-ndikum/TemporalScope.git - $ cd TemporalScope - $ hatch shell - ``` + ```console + $ git clone https://github.com/philip-ndikum/TemporalScope.git + $ cd TemporalScope + $ hatch shell + ``` + This process clones the repository, navigates to the project directory, and uses Hatch to create and activate a virtual environment with the project installed in development mode. + - This process clones the repository, navigates to the project directory, and uses Hatch to create and activate a virtual environment with the project installed in development mode. - -## **Usage** +## Usage + You can use TemporalScope with the following steps: 1. **Import TemporalScope**: Start by importing the package. 2. **Select Backend (Optional)**: TemporalScope defaults to using Pandas as the backend. However, you can specify other backends like Dask, Modin, or CuDF. 3. **Load Data**: Load your time series data into the `TimeSeriesData` class, specifying the `time_col` and optionally the `id_col`. 4. **Apply a Feature Importance Method**: TemporalScope defaults to using a Random Forest model from scikit-learn if no model is specified. You can either: - - **A. Use a pre-trained model**: Pass a pre-trained model to the method. - - **B. Train a Random Forest model within the method**: TemporalScope handles model training and application automatically. + - **A. Use a pre-trained model**: Pass a pre-trained model to the method. + - **B. Train a Random Forest model within the method**: TemporalScope handles model training and application automatically. 5. **Analyze and Visualize Results**: Interpret the results to understand how feature importance evolves over time or across different phases. Now, let's refine the code example using a random forest model and an academic dataset. We'll use the California housing dataset as a simple example since it's well-known and accessible. - ```python import polars as pl import pandas as pd from statsmodels.datasets import macrodata + from temporalscope.core.temporal_data_loader import TimeFrame from temporalscope.partitioning.naive_partitioner import NaivePartitioner from temporalscope.core.temporal_model_trainer import TemporalModelTrainer # 1. Load the dataset using Pandas (or convert to Polars) macro_df = macrodata.load_pandas().data -macro_df['time'] = pd.date_range(start='1959-01-01', periods=len(macro_df), freq='Q') +macro_df['time'] = pd.date_range( + start='1959-01-01', + periods=len(macro_df), + freq='Q' +) # Convert the Pandas DataFrame to a Polars DataFrame macro_df_polars = pl.DataFrame(macro_df) @@ -177,8 +213,9 @@ for partition_name, predictions in results.items(): print(f"Predictions for {partition_name}:") print(predictions[:5]) # Display first 5 predictions ``` + -### **Industrial Academic Applications** +### Industrial Academic Applications **DISCLAIMER**: The following use cases are provided for academic and informational purposes only. 
TemporalScope is intended to support research and development in understanding temporal dynamics in feature importance. These examples are not intended as guidance for industrial applications without further validation and expert consultation. The use of TemporalScope in any industrial or production environment is at the user's own risk, and the developers disclaim any liability for such use. Please refer to the [License and Legal Notice](#license-and-legal-notice) for further details. @@ -195,33 +232,31 @@ For more detailed examples from sectors like engineering and other scientific ap For detailed test, security, and deployment workflows as defined by OpenSSF Best Practices, please refer to [CONTRIBUTING.md](CONTRIBUTING.md). **TemporalScope** follows **Semantic Versioning (SemVer)**, a versioning system that conveys the scope of changes introduced in each new release. Each version is represented in the form **MAJOR.MINOR.PATCH**, where major releases introduce significant or breaking changes, minor releases add backward-compatible functionality, and patch releases are used for bug fixes or minor improvements. Below is the planned roadmap outlining feature development and milestones over the next 12–18 months. This roadmap is subject to change based on user feedback, emerging research, and community contributions. -## **Contributing** +## Contributing TemporalScope was conceived independently by [Philip Ndikum](https://github.com/philip-ndikum), [Serge Ndikum](https://github.com/serge-ndikum), and [Kane Norman](https://github.com/kanenorman) and has since been open-sourced to the broader academic and developer community. As the software continues to grow and evolve, it relies heavily on the active participation and contributions of its users. We encourage contributions from developers, researchers, and data scientists who are passionate about advancing open-source tools. Whether you are interested in extending the package’s functionality, fixing bugs, or improving documentation, your contributions are vital to the project’s ongoing success. For detailed guidelines on how to contribute, please refer to our [CONTRIBUTING.md](CONTRIBUTING.md). By working together, we can ensure that TemporalScope remains an innovative and reliable tool, continuously refined through community collaboration. -### **Contributors 💠** + -Thanks to these wonderful people: +## Contributors +Thanks to these wonderful people who have contributed to TemporalScope: - - - - - - - - -
[Contributors table: HTML markup lost in extraction. The removed table credited Philip Ndikum 💻, Serge Ndikum 💻, and Kane Norman 💻; the markup of the replacement block that follows is likewise unrecoverable.]
+ + + + -## Cite this Project + -If you use **TemporalScope** in your research, please consider citing it: +## Cite this Project + +If you use TemporalScope in your research, please consider citing it: ```bibtex @software{ndikum2024temporalscope, @@ -233,13 +268,19 @@ If you use **TemporalScope** in your research, please consider citing it: url = {https://github.com/philip-ndikum/TemporalScope} } ``` + -## **License, Limitations, and Legal Notice** -**TemporalScope** is primarily an academic tool designed for research and informational purposes. Practitioners and users of this software are strongly encouraged to consult the accompanying [SCIENTIFIC_LITERATURE.md](SCIENTIFIC_LITERATURE.md) document to fully understand the theoretical limitations, assumptions, and context of the techniques implemented within this package. Furthermore, use of this software falls under "as-is" software as defined by the [Apache License 2.0](LICENSE) provided in this repository and outlined below. +## License, Limitations, and Legal Notice + +TemporalScope is primarily an academic tool designed for research and informational purposes. Practitioners and users of this software are strongly encouraged to consult the accompanying scientific literature to fully understand the theoretical limitations, assumptions, and context of the techniques implemented within this package. Furthermore, use of this software falls under "as-is" software as defined by the Apache License 2.0 provided in this repository. -By using this package, you agree to comply with the terms and conditions set forth in the **Apache License 2.0**. +By using this package, you agree to comply with the terms and conditions set forth in the Apache License 2.0. + -**LEGAL NOTICE**: THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT, OR OTHERWISE, ARISING FROM, OUT OF, OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +### Legal Notice +THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT, OR OTHERWISE, ARISING FROM, OUT OF, OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. THIS SOFTWARE IS INTENDED FOR ACADEMIC AND INFORMATIONAL PURPOSES ONLY. IT SHOULD NOT BE USED IN PRODUCTION ENVIRONMENTS OR FOR CRITICAL DECISION-MAKING WITHOUT PROPER VALIDATION. ANY USE OF THIS SOFTWARE IS AT THE USER'S OWN RISK. + diff --git a/SCIENTIFIC_LITERATURE.md b/SCIENTIFIC_LITERATURE.md index 6039f3f..5101ec9 100644 --- a/SCIENTIFIC_LITERATURE.md +++ b/SCIENTIFIC_LITERATURE.md @@ -1,4 +1,4 @@ -## Engineering Design +## Literature This document lists key literature that has informed the development of this package. Please note that this is not a conclusive list but highlights the most relevant works. Our design is explicitly built for flexibility, unlike other time series machine learning and deep learning packages that often enforce rigid preprocessing constraints. 
We intentionally adopt familiar software engineering patterns, inspired by scikit-learn, to provide a modular and adaptable framework. The only assumption we impose is that features must be organized in a context window prior to the target variable. This allows users to focus on their core applications while ensuring compatibility with SHAP and other explainability methods. @@ -16,7 +16,7 @@ This document lists key literature that has informed the development of this pac | **Scientific Literature** | [Universal Time-Series Representation Learning: A Survey](https://arxiv.org/abs/2401.03717) | Trirat, P., Shin, Y., Kang, J., et al. | arXiv preprint, 2024 | Provides a comprehensive survey of universal models for time series, outlining how generalization across datasets is achieved with minimal assumptions. | -### Partitioning Guidelines +## Partitioning Guidelines The following heuristics are derived from key papers in the field and are designed to ensure that data partitions used in temporal analysis are robust and appropriate for machine learning tasks. @@ -32,6 +32,6 @@ The following heuristics are derived from key papers in the field and are design | **Binary numerical features** | Convert to categorical if 2 unique values | Properly categorizes binary features to ensure that the models interpret them correctly. | [Grinsztajn et al. (2022)](https://arxiv.org/pdf/2207.08815) | | **Class balance (for classification)** | Equal samples per class | Ensures that the learning problem is balanced, which is crucial for model accuracy and fairness. | [Grinsztajn et al. (2022)](https://arxiv.org/pdf/2207.08815) | -### **Contributor Guidelines** +## Contributor Guidelines Contributors are encouraged to reference relevant literature when making contributions. Please ensure that the appropriate citations are included in this document and the codebase where applicable. diff --git a/docs/citation.md b/docs/citation.md new file mode 100644 index 0000000..6c2b7f0 --- /dev/null +++ b/docs/citation.md @@ -0,0 +1,10 @@ +--- +icon: material/format-quote-open +--- +# Cite this Project 💬 + +--8<-- +README.md:CITATION + +README.md:CONTRIBUTORS +--8<-- diff --git a/docs/conf.py b/docs/conf.py deleted file mode 100644 index 7645e78..0000000 --- a/docs/conf.py +++ /dev/null @@ -1,81 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -# Configuration file for the Sphinx documentation builder. 
-# -# For the full list of built-in configuration values, see the documentation: -# https://www.sphinx-doc.org/en/master/usage/configuration.html - -from __future__ import annotations - -import datetime -import importlib.metadata - -project = "TemporalScope" -author = "Philip Ndikum, Serge Ndikum, Kane Norman" -copyright = f"{datetime.datetime.now().year}, {author}" -version = release = importlib.metadata.version("temporalscope") - -extensions = [ - "myst_parser", - "sphinx.ext.autodoc", - "sphinx_autodoc_typehints", - "sphinx.ext.intersphinx", - "sphinx.ext.mathjax", - "sphinx.ext.viewcode", - "sphinx_copybutton", - "autoapi.extension", -] - -autoapi_dirs = ["../src"] -templates_path = ["_templates"] -source_suffix = [".rst", ".md"] -exclude_patterns = [ - "_build", - "**.ipynb_checkpoints", - "Thumbs.db", - ".DS_Store", - ".env", - ".venv", -] - - -master_doc = "index" -autoclass_content = "class" -autosummary_generate = True - -html_theme = "pydata_sphinx_theme" -html_title = project -html_theme_options = { - "icon_links": [ - { - "name": "GitHub", - "url": "https://github.com/philip-ndikum/TemporalScope", - "icon": "fa-brands fa-github", - "type": "fontawesome", - } - ] -} - -myst_enable_extensions = ["colon_fence"] -intersphinx_mapping = {"python": ("https://docs.python.org/3", None)} -nitpick_ignore = [ - ("py:class", "_io.StringIO"), - ("py:class", "_io.BytesIO"), -] - -always_document_param_types = True diff --git a/docs/contributing.md b/docs/contributing.md new file mode 100644 index 0000000..f1f3397 --- /dev/null +++ b/docs/contributing.md @@ -0,0 +1,4 @@ +--- +icon: material/code-block-tags +--- +--8<-- "CONTRIBUTING.md:CONTRIBUTING" diff --git a/docs/css/colors.css b/docs/css/colors.css new file mode 100644 index 0000000..fe1f92e --- /dev/null +++ b/docs/css/colors.css @@ -0,0 +1,24 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +:root { + --md-primary-fg-color: #008080; + --md-primary-fg-color--light: #06C7C7; + --md-primary-fg-color--dark: #025252; +} diff --git a/docs/css/mkdocstrings.css b/docs/css/mkdocstrings.css new file mode 100644 index 0000000..6407f4e --- /dev/null +++ b/docs/css/mkdocstrings.css @@ -0,0 +1,43 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +.doc-symbol-parameter::after { + content: "parameter" !important; +} + +.doc-symbol-attribute::after { + content: "attribute" + !important +} + +.doc-symbol-function::after { + content: "function" !important; +} + +.doc-symbol-method::after { + content: "method" !important; +} + +.doc-symbol-class::after { + content: "class" !important; +} + +.doc-symbol-module::after { + content: "module" !important; +} diff --git a/docs/images/favicon.ico b/docs/images/favicon.ico new file mode 100644 index 0000000..6df297e Binary files /dev/null and b/docs/images/favicon.ico differ diff --git a/assets/temporalscope_github_banner.svg b/docs/images/logo.svg similarity index 85% rename from assets/temporalscope_github_banner.svg rename to docs/images/logo.svg index c6a24f8..d15eba7 100644 --- a/assets/temporalscope_github_banner.svg +++ b/docs/images/logo.svg @@ -1,4 +1,23 @@ + + - :end-before: - -.. toctree:: - :maxdepth: 2 - :caption: Contents: - -Indices and tables -================== - -* :ref:`genindex` -* :ref:`modindex` -* :ref:`search` diff --git a/docs/installation.md b/docs/installation.md new file mode 100644 index 0000000..74b8e11 --- /dev/null +++ b/docs/installation.md @@ -0,0 +1,32 @@ +--- +icon: material/wrench +--- +# Installation 🛠️ + +> WARNING: TemporalScope is currently in **beta**, so some installation methods may not work as expected on all platforms. + + === "pip (recommended)" + + ``` console + $ pip install temporalscope + ``` + + === "uv" + + ``` console + $ uv add temporalscope + ``` + + === "conda" + + ``` console + $ conda install -c conda-forge temporalscope + ``` + + === "source" + + ``` console + $ git clone https://github.com/philip-ndikum/TemporalScope.git + $ cd TemporalScope + $ hatch shell + ``` diff --git a/docs/license_and_legal.md b/docs/license_and_legal.md new file mode 100644 index 0000000..e75bdfa --- /dev/null +++ b/docs/license_and_legal.md @@ -0,0 +1,12 @@ +--- +icon: material/scale-balance +--- +# License, Limitations, and Legal Notice ⚖️ +--8<-- "README.md:LICENSE" + +??? note "Apache License 2.0" + ``` + --8<-- "LICENSE" + ``` + +--8<-- "README.md:LEGAL" diff --git a/docs/make.bat b/docs/make.bat deleted file mode 100644 index 545a312..0000000 --- a/docs/make.bat +++ /dev/null @@ -1,52 +0,0 @@ -rem Licensed to the Apache Software Foundation (ASF) under one -rem or more contributor license agreements. See the NOTICE file -rem distributed with this work for additional information -rem regarding copyright ownership. The ASF licenses this file -rem to you under the Apache License, Version 2.0 (the -rem "License"); you may not use this file except in compliance -rem with the License. You may obtain a copy of the License at -rem -rem http://www.apache.org/licenses/LICENSE-2.0 -rem -rem Unless required by applicable law or agreed to in writing, -rem software distributed under the License is distributed on an -rem "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -rem KIND, either express or implied. See the License for the -rem specific language governing permissions and limitations -rem under the License. 
- -@ECHO OFF - -pushd %~dp0 - -REM Command file for Sphinx documentation - -if "%SPHINXBUILD%" == "" ( - set SPHINXBUILD=sphinx-build -) -set SOURCEDIR=. -set BUILDDIR=_build - -%SPHINXBUILD% >NUL 2>NUL -if errorlevel 9009 ( - echo. - echo.The 'sphinx-build' command was not found. Make sure you have Sphinx - echo.installed, then set the SPHINXBUILD environment variable to point - echo.to the full path of the 'sphinx-build' executable. Alternatively you - echo.may add the Sphinx directory to PATH. - echo. - echo.If you don't have Sphinx installed, grab it from - echo.https://www.sphinx-doc.org/ - exit /b 1 -) - -if "%1" == "" goto help - -%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% -goto end - -:help -%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% - -:end -popd diff --git a/docs/references.md b/docs/references.md new file mode 100644 index 0000000..1c26388 --- /dev/null +++ b/docs/references.md @@ -0,0 +1,5 @@ +--- +icon: material/book-open-variant +--- +# References and Further Reading 📖 +--8<-- "SCIENTIFIC_LITERATURE.md" diff --git a/docs/usage.md b/docs/usage.md new file mode 100644 index 0000000..6b71ec2 --- /dev/null +++ b/docs/usage.md @@ -0,0 +1,6 @@ +--- +title: Quick Start +icon: material/run-fast +--- +# Quick Start 🚀 +--8<-- "README.md:USAGE" diff --git a/mkdocs.yml b/mkdocs.yml new file mode 100644 index 0000000..7af657f --- /dev/null +++ b/mkdocs.yml @@ -0,0 +1,118 @@ +site_name: "TemporalScope" +site_description: "Scientifically driven Model-Agnostic Temporal Feature Importance Analysis with SHAP & partitioning algorithms (supporting Pandas, Polars, Modin, PyArrow, Dask)." +copyright: "Copyright © Philip Ndikum, Serge Ndikum, Kane Norman 2024-present" + +repo_name: philip-ndikum/TemporalScope +repo_url: https://github.com/philip-ndikum/TemporalScope + +nav: +- Home: + - About: index.md + - Getting Started: + - Installation: installation.md + - Usage: usage.md + - Development: + - Contributing: contributing.md + - Disclaimers: + - License & Legal: license_and_legal.md + - Literature: + - References: references.md + - Cite: citation.md +- API reference: reference/ + +theme: + name: "material" + language: en + favicon: "images/favicon.ico" + icon: + repo: fontawesome/brands/github-alt + font: + text: Roboto + code: Roboto Mono + features: + - navigation.tabs + - navigation.tabs.sticky + - navigation.sections + - navigation.footer + - toc.follow + - navigation.top + - search.suggest + - search.highlight + - search.share + palette: + + - media: "(prefers-color-scheme)" + primary: custom + toggle: + icon: material/brightness-auto + name: Switch to light mode + + - media: "(prefers-color-scheme: light)" + scheme: default + primary: custom + toggle: + icon: material/weather-sunny + name: Switch to dark mode + + - media: "(prefers-color-scheme: dark)" + scheme: slate + primary: custom + toggle: + icon: material/weather-night + name: Switch to system preference + + +plugins: +- search +- gen-files: + scripts: + - scripts/gen_ref_pages.py +- literate-nav: + nav_file: SUMMARY.md +- section-index +- mkdocstrings: + handlers: + python: + paths: [src] + options: + docstring_options: + ignore_init_summary: true + docstring_style: numpy + summary: true + show_bases: false + annotations_path: brief + docstring_section_style: spacy + merge_init_into_class: true + show_if_no_docstring: true + show_symbol_type_heading: true + show_symbol_type_toc: true + show_labels : false + separate_signature: true + show_signature_annotations: true + signature_crossrefs: true + 
line_length: 60 + +extra: + social: + - icon: fontawesome/brands/github + link: https://github.com/philip-ndikum/TemporalScope + name: TemporalScope on GitHub + +markdown_extensions: + - def_list + - admonition + - callouts: + strip_period: false + - pymdownx.details + - pymdownx.superfences + - pymdownx.tabbed: + alternate_style: true + - pymdownx.snippets + - pymdownx.arithmatex + - pymdownx.emoji + - attr_list + - md_in_html + +extra_css: +- css/mkdocstrings.css +- css/colors.css diff --git a/pyproject.toml b/pyproject.toml index c1830a8..9b74c52 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,80 +7,81 @@ name = "temporalscope" dynamic = ["version"] description = "TemporalScope: Model-Agnostic Temporal Feature Importance Analysis." authors = [ - { name = "Philip Ndikum", email = "philip-ndikum@users.noreply.github.com" }, - { name = "Serge Ndikum" }, - { name = "Kane Norman", email = "kanenorman@fas.harvard.edu" }, + { name = "Philip Ndikum", email = "philip-ndikum@users.noreply.github.com" }, + { name = "Serge Ndikum" }, + { name = "Kane Norman", email = "kanenorman@fas.harvard.edu" }, ] license = "Apache-2.0" readme = "README.md" requires-python = ">=3.10,<3.12" dependencies = [ - # Core Explainability Libraries - "shap>=0.46.0", # SHAP (SHapley Additive exPlanations): For model interpretability and feature importance in machine learning. - "borutashap>=1.0.17", # Boruta-SHAP: Combines SHAP with Boruta for feature selection, helping identify the most important features. - "lime>=0.2.0.1", # LIME (Local Interpretable Model-agnostic Explanations): Enables model-agnostic interpretability, focusing on local, instance-specific explanations. - - # General Machine Learning and Statistics - "scikit-learn>=1.5.1", # Scikit-Learn: Essential machine learning library for classification, regression, and clustering. - "lightgbm>=4.5.0", # LightGBM: Gradient-boosted decision tree library that's efficient for high-dimensional data. - "statsmodels>=0.14.2", # Statsmodels: Provides statistical models and hypothesis tests, useful for traditional time series analysis. - - # Data Manipulation and Storage - "pandas>=1.5.0", # Pandas: Core library for handling structured data in DataFrames. - "modin[all]>=0.31.0", # Modin: Parallelizes Pandas operations for large datasets with Dask/Ray backend. - "pyarrow>=17.0.0", # PyArrow: Enables efficient columnar data format (Arrow) for fast data processing and storage, commonly used with Parquet files. - "polars>=1.5.0", # Polars: High-performance DataFrame library, ideal for handling large datasets in memory. - "dask[dataframe]>=2024.7", # Dask DataFrame: Handles parallelized computations for very large datasets. - - # Specialized Computation - "flax>=0.8.5", # Flax: Neural network library for JAX, useful for building deep learning models. - "jax>=0.4.31", # JAX: Accelerated computation for machine learning research, supporting CPU/GPU/TPU backends. - - # Backend-Agnostic Processing - "narwhals>=0.2.3", # Narwhals: Ensures backend compatibility across Pandas, Polars, Modin, and other dataframes, optimizing performance. - - # Environment Configuration - "python-dotenv>=1.0.1", # Python-Dotenv: Manages environment variables for secure and flexible configuration. - - # Markdown Table Display - "tabulate>=0.9.0", # Tabulate: Allows Markdown-friendly table formatting for DataFrames in Narwhals. + # Core Explainability Libraries + "shap>=0.46.0", # SHAP (SHapley Additive exPlanations): For model interpretability and feature importance in machine learning. 
+ "borutashap>=1.0.17", # Boruta-SHAP: Combines SHAP with Boruta for feature selection, helping identify the most important features. + "lime>=0.2.0.1", # LIME (Local Interpretable Model-agnostic Explanations): Enables model-agnostic interpretability, focusing on local, instance-specific explanations. + + # General Machine Learning and Statistics + "scikit-learn>=1.5.1", # Scikit-Learn: Essential machine learning library for classification, regression, and clustering. + "lightgbm>=4.5.0", # LightGBM: Gradient-boosted decision tree library that's efficient for high-dimensional data. + "statsmodels>=0.14.2", # Statsmodels: Provides statistical models and hypothesis tests, useful for traditional time series analysis. + + # Data Manipulation and Storage + "pandas>=1.5.0", # Pandas: Core library for handling structured data in DataFrames. + "modin[all]>=0.31.0", # Modin: Parallelizes Pandas operations for large datasets with Dask/Ray backend. + "pyarrow>=17.0.0", # PyArrow: Enables efficient columnar data format (Arrow) for fast data processing and storage, commonly used with Parquet files. + "polars>=1.5.0", # Polars: High-performance DataFrame library, ideal for handling large datasets in memory. + "dask[dataframe]>=2024.7", # Dask DataFrame: Handles parallelized computations for very large datasets. + + # Specialized Computation + "flax>=0.8.5", # Flax: Neural network library for JAX, useful for building deep learning models. + "jax>=0.4.31", # JAX: Accelerated computation for machine learning research, supporting CPU/GPU/TPU backends. + + # Backend-Agnostic Processing + "narwhals>=0.2.3", # Narwhals: Ensures backend compatibility across Pandas, Polars, Modin, and other dataframes, optimizing performance. + + # Environment Configuration + "python-dotenv>=1.0.1", # Python-Dotenv: Manages environment variables for secure and flexible configuration. + + # Markdown Table Display + "tabulate>=0.9.0", # Tabulate: Allows Markdown-friendly table formatting for DataFrames in Narwhals. 
] classifiers = [ - "Development Status :: 2 - Pre-Alpha", - "Intended Audience :: Science/Research", - "License :: OSI Approved :: Apache Software License", - "Operating System :: OS Independent", - "Programming Language :: Python", - "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.10", - "Programming Language :: Python :: 3.11", - "Programming Language :: Python :: 3.12", - "Topic :: Scientific/Engineering :: Artificial Intelligence", + "Development Status :: 2 - Pre-Alpha", + "Intended Audience :: Science/Research", + "License :: OSI Approved :: Apache Software License", + "Operating System :: OS Independent", + "Programming Language :: Python", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Topic :: Scientific/Engineering :: Artificial Intelligence", ] keywords = [ - "Shap", - "Borutashap", - "Feature-Importance", - "Temporal-Feature-Importance", - "Temporal-Feature-Analysis", - "XAI", - "ML", - "AI", - "Machine-Learning", - "Artificial-Intelligence", - "TemporalScope", - "Time-Series", + "Shap", + "Borutashap", + "Feature-Importance", + "Temporal-Feature-Importance", + "Temporal-Feature-Analysis", + "XAI", + "ML", + "AI", + "Machine-Learning", + "Artificial-Intelligence", + "TemporalScope", + "Time-Series", ] [project.optional-dependencies] docs = [ - "pydata-sphinx-theme", - "myst-parser", - "sphinx >=4.0", - "sphinx-copybutton", - "sphinx-autodoc-typehints", - "sphinx-autodoc-annotation", - "sphinx-autoapi", + "mkdocs", + "mkdocstrings-python", + "mkdocs-material", + "mkdocs-gen-files", + "mkdocs-literate-nav", + "mkdocs-section-index", + "markdown-callouts", + "black", ] [project.urls] @@ -88,36 +89,35 @@ docs = [ Documentation = "https://temporalscope.readthedocs.io/en/latest/" [tool.hatch.envs.default] -dependencies = ["pre-commit", "ruff", "jupyterlab", "notebook", "commitizen", "types-tabulate>=0.9.0"] +dependencies = [ + "pre-commit", + "ruff", + "jupyterlab", + "notebook", + "commitizen", + "types-tabulate>=0.9.0", +] [tool.hatch.envs.docs] features = ["docs"] [tool.hatch.envs.test] extra-dependencies = [ - "pytest", - "pytest-cov", - "pytest-custom_exit_code", - "pytest-mock", - "papermill>=2.5.0", + "pytest", + "pytest-cov", + "pytest-custom_exit_code", + "pytest-mock", + "papermill>=2.5.0", ] [tool.hatch.envs.docs.scripts] -build = "sphinx-build -WTb html . 
_build" -serve = "python -m http.server --directory _build" +build = "mkdocs build --strict" +serve = "mkdocs serve --strict" [tool.hatch.envs.test.scripts] unit = 'pytest --cov-report=lcov --cov-report=term --cov="temporalscope" -m "not integration and not notebook" {args:test}' notebook = 'pytest -m "notebook" {args:test}' integration = 'pytest -m "integration" {args:test}' -quality-assurance = """ -pytest && -docformatter --check --recursive --wrap-summaries 120 --wrap-descriptions 120 src/temporalscope || \ -docformatter --in-place --recursive --wrap-summaries 120 --wrap-descriptions 120 src/temporalscope && -ruff check src/temporalscope --output-format=full --show-files --show-fixes && -mypy src/temporalscope --ignore-missing-imports --show-error-codes --warn-unreachable && -bandit -r src/temporalscope -""" [tool.pytest.ini_options] testpaths = ["tests"] @@ -129,11 +129,13 @@ log_date_format = "%Y-%m-%d %H:%M:%S" minversion = "6.0" filterwarnings = "ignore" markers = [ - "notebook: marks tests that verify notebook execution", - "integration: marks tests that require external resources", + "notebook: marks tests that verify notebook execution", + "integration: marks tests that require external resources", ] [tool.ruff.format] +quote-style = "double" +indent-style = "space" docstring-code-format = true [tool.ruff] @@ -143,53 +145,57 @@ line-length = 120 [tool.ruff.lint] select = [ - "C4", # flake8-comprehensions - "C90", # McCabe cyclomatic complexity - "E501", # Long lines - "EXE", # flake8-executable - "F", # Pyflakes - "INT", # flake8-gettext - "PERF", # Perflint - "PL", # Pylint - "Q", # flake8-quotes - "SIM", # flake8-simplify - "SLOT", # flake8-slots - "T10", # flake8-debugger - "W", # pycodestyle - "YTT", # flake8-2020 - "I", # isort - # built-in shadowing - "A001", # builtin-variable-shadowing - "A002", # builtin-argument-shadowing - "A003", # builtin-attribute-shadowing - # docstring rules - "D", # flake8-docstrings + "A001", # builtin-variable-shadowing + "A002", # builtin-argument-shadowing + "A003", # builtin-attribute-shadowing + "C4", # flake8-comprehensions + "C90", # McCabe cyclomatic complexity + "D", # flake8-docstrings + "E501", # Long lines + "EXE", # flake8-executable + "F", # Pyflakes + "I", # isort + "INT", # flake8-gettext + "PERF", # Perflint + "PL", # Pylint + "Q", # flake8-quotes + "SIM", # flake8-simplify + "SLOT", # flake8-slots + "T10", # flake8-debugger + "W", # pycodestyle + "YTT", # flake8-2020 ] ignore = [ - "D400", # Ignore "First line should end with a period" for docstrings. - "D401", # Ignore "First line should be in imperative mood" for docstrings. - "D415", # Ignore "First line should end with a period, question mark, or exclamation point." - "E501", # Ignore "Line too long" in docstrings/comments for exceeding 120 characters. 
- "PERF203", # `try`-`except` within a loop incurs performance overhead - "PERF401", # Use a list comprehension to create a transformed list - "PLR1714", # repeated-equality-comparison - "PLR5501", # collapsible-else-if - "PLW2901", # redefined-loop-name - "SIM108", # if-else-block-instead-of-if-exp - "PLR0913", # too many arguments - "SIM102", # temporary - "C901", # temporary - "D100", # Missing docstring in public module - "D104", # Missing docstring in public package - "D211", # No blank lines allowed before class docstring - "D213", # Multi-line docstring summary should start at the second line - "D203", # 1 blank line required before class docstring + "C901", # temporary + "D100", # Missing docstring in public module + "D104", # Missing docstring in public package + "D203", # 1 blank line required before class docstring + "D211", # No blank lines allowed before class docstring + "D213", # Multi-line docstring summary should start at the second line + "D400", # Ignore "First line should end with a period" for docstrings. + "D401", # Ignore "First line should be in imperative mood" for docstrings. + "D415", # Ignore "First line should end with a period, question mark, or exclamation point." + "E501", # Ignore "Line too long" in docstrings/comments for exceeding 120 characters. + "PERF203", # `try`-`except` within a loop incurs performance overhead + "PERF401", # Use a list comprehension to create a transformed list + "PLR0913", # too many arguments + "PLR1714", # repeated-equality-comparison + "PLR5501", # collapsible-else-if + "PLW2901", # redefined-loop-name + "SIM102", # temporary + "SIM108", # if-else-block-instead-of-if-exp ] +[tool.ruff.lint.pydocstyle] +convention = "numpy" + [tool.ruff.lint.per-file-ignores] -"docs/conf.py" = ["A001", "D103"] -"test/*" = ["PLR2004"] # Ignore magic number warnings in test files +"docs/conf.py" = [ + "A001", # builtin-variable-shadowing + "D103", # missing docstring in public function +] +"test/*" = ["PLR2004"] # Ignore magic number warnings in test files [tool.mypy] files = "src/temporalscope" @@ -197,7 +203,7 @@ python_version = "3.10" ignore_missing_imports = true warn_unreachable = true exclude = 'test/*' -warn_return_any = false # Turn off MyPy warnings for missing return types +warn_return_any = false # Turn off MyPy warnings for missing return types [tool.bandit] exclude_dirs = ["test"] @@ -217,10 +223,6 @@ check = "ruff check {args}" fix = "ruff check --fix" format = "ruff format {args}" format-check = "ruff format --check {args}" -docformat = """ -docformatter --check --recursive --wrap-summaries 120 --wrap-descriptions 120 src/temporalscope || \ -docformatter --in-place --recursive --wrap-summaries 120 --wrap-descriptions 120 src/temporalscope -""" clear-coverage = "coverage erase" generate-kernel = """ python -m ipykernel install --user --name temporalscope-kernel --display-name "TemporalScope" diff --git a/scripts/gen_ref_pages.py b/scripts/gen_ref_pages.py new file mode 100644 index 0000000..7f6205d --- /dev/null +++ b/scripts/gen_ref_pages.py @@ -0,0 +1,52 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Generate the code reference pages and navigation.""" + +from pathlib import Path + +import mkdocs_gen_files + +nav = mkdocs_gen_files.Nav() + +root = Path(__file__).parent.parent +src = root / "src" + +for path in sorted(src.rglob("*.py")): + module_path = path.relative_to(src).with_suffix("") + doc_path = path.relative_to(src).with_suffix(".md") + full_doc_path = Path("reference", doc_path) + + parts = tuple(module_path.parts) + + if parts[-1] == "__init__": + parts = parts[:-1] + doc_path = doc_path.with_name("index.md") + full_doc_path = full_doc_path.with_name("index.md") + elif parts[-1] == "__main__": + continue + + nav[parts] = doc_path.as_posix() + + with mkdocs_gen_files.open(full_doc_path, "w") as fd: + ident = ".".join(parts) + fd.write(f"::: {ident}") + + mkdocs_gen_files.set_edit_path(full_doc_path, path.relative_to(root)) + +with mkdocs_gen_files.open("reference/SUMMARY.md", "w") as nav_file: + nav_file.writelines(nav.build_literate_nav()) diff --git a/src/temporalscope/core/core_utils.py b/src/temporalscope/core/core_utils.py index 5640b1c..eb58f68 100644 --- a/src/temporalscope/core/core_utils.py +++ b/src/temporalscope/core/core_utils.py @@ -19,11 +19,11 @@ This module provides essential utility functions for the TemporalScope package, including support for: -- Backend validation (Narwhals). -- Checking for nulls, NaNs, and handling mixed frequency issues in time series - data. -- Managing different modes (Single-step vs. Multi-step) for machine learning and - deep learning workflows. + - Backend validation (Narwhals). + - Checking for nulls, NaNs, and handling mixed frequency issues in time series + data. + - Managing different modes (Single-step vs. Multi-step) for machine learning and + deep learning workflows. Type safety and validation follow Linux Foundation security standards to ensure robust backend interoperability. @@ -69,22 +69,11 @@ to customize their workflows for specific model-building tasks (e.g., tree-based models, neural networks, etc.): -+----------------+-------------------------------------------------------------------+ -| Mode | Description | -| | Data Structure | -+----------------+-------------------------------------------------------------------+ -| single_step | General machine learning tasks with scalar targets. Each row is | -| | a single time step, and the target is scalar. | -| | Single DataFrame: each row is an observation. | - :raises ValueError: If validation fails and `raise_error` is True. -+----------------+-------------------------------------------------------------------+ -| multi_step | Sequential time series tasks (e.g., seq2seq) for deep learning. | -| | The data is split into sequences (input X, target Y). | -| | Two DataFrames: X for input sequences, Y for targets. | -| | Frameworks: TensorFlow, PyTorch, Keras. | -+----------------+-------------------------------------------------------------------+ - -.. seealso:: + +| Mode | Description | +|------|-------------| +| single_step | General machine learning tasks with scalar targets. Each row is a single time step, and the target is scalar. 
Single DataFrame: each row is an observation. | +| multi_step | Sequential time series tasks (e.g., seq2seq) for deep learning. The data is split into sequences (input X, target Y). Two DataFrames: X for input sequences, Y for targets. Frameworks: TensorFlow, PyTorch, Keras. | Example Visualization: ---------------------- @@ -92,90 +81,52 @@ modes, including the shape of input (`X`) and target (`Y`) data compatible with most popular ML frameworks like TensorFlow, PyTorch, and SHAP. -Single-step mode: - +------------+------------+------------+------------+-----------+ - | time | feature_1 | feature_2 | feature_3 | target | - +============+============+============+============+===========+ - | 2023-01-01 | 0.15 | 0.67 | 0.89 | 0.33 | - +------------+------------+------------+------------+-----------+ - | 2023-01-02 | 0.24 | 0.41 | 0.92 | 0.28 | - +------------+------------+------------+------------+-----------+ - - Shape: - - `X`: (num_samples, num_features) - - `Y`: (num_samples, 1) # Scalar target for each time step - -Multi-step mode (with vectorized targets): - - +------------+------------+------------+------------+-------------+ - | time | feature_1 | feature_2 | feature_3 | target | - +============+============+============+============+=============+ - | 2023-01-01 | 0.15 | 0.67 | 0.89 | [0.3, 0.4] | - +------------+------------+------------+------------+-------------+ - | 2023-01-02 | 0.24 | 0.41 | 0.92 | [0.5, 0.6] | - +------------+------------+------------+------------+-------------+ - - Shape: - - `X`: (num_samples, num_features) - - `Y`: (num_samples, sequence_length) # Vectorized target for each input sequence +**Single-step mode**: + +- Input shape: X (num_samples, num_features) +- Target shape: Y (num_samples, 1) # Scalar target for each time step + +| time | feature_1 | feature_2 | feature_3 | target | +|------------|-----------|-----------|-----------|--------| +| 2023-01-01 | 0.15 | 0.67 | 0.89 | 0.33 | +| 2023-01-02 | 0.24 | 0.41 | 0.92 | 0.28 | + +**Multi-step mode (with vectorized targets)**: + +- Input shape: X (num_samples, num_features) +- Target shape: Y (num_samples, sequence_length) # Vectorized target for each input sequence + + +| time | feature_1 | feature_2 | feature_3 | target | +|------------|-----------|-----------|-----------|--------------| +| 2023-01-01 | 0.15 | 0.67 | 0.89 | [0.3, 0.4] | +| 2023-01-02 | 0.24 | 0.41 | 0.92 | [0.5, 0.6] | DataFrame Types: ---------------- TemporalScope handles various DataFrame types throughout the data processing pipeline. The following table illustrates the supported DataFrame types and validation cases: -+------------------------+-------------------------------------------------------+---------------------------+ -| DataFrame Type | Description | Example | -+------------------------+-------------------------------------------------------+---------------------------+ -| Narwhalified | DataFrames wrapped by Narwhals for backend-agnostic | @nw.narwhalify decorated | -| (FrameT) | operations. These are validated first to ensure | functions create these | -| | consistent handling across backends. | during operations. | -+------------------------+-------------------------------------------------------+---------------------------+ -| Native DataFrames | Core DataFrame implementations from supported | pd.DataFrame, | -| | backends. These are validated directly against | pl.DataFrame, | -| | TEMPORALSCOPE_CORE_BACKEND_TYPES. 
| pa.Table | -+------------------------+-------------------------------------------------------+---------------------------+ -| DataFrame Subclasses | Custom or specialized DataFrames that inherit from | TimeSeriesDataFrame | -| | native types. Common in: | (pd.DataFrame), | -| | - Custom DataFrame implementations | dask.dataframe.DataFrame | -| | - Backend optimizations (e.g. lazy evaluation) | (inherits from pandas) | -| | - Backend compatibility layers | | -+------------------------+-------------------------------------------------------+---------------------------+ -| Intermediate States | DataFrames in the middle of narwhalify operations | LazyDataFrame during | -| | or backend conversions. These may be temporary | backend conversion | -| | subclasses used for optimization or compatibility. | operation chaining | -+------------------------+-------------------------------------------------------+---------------------------+ - -.. note:: - +| DataFrame Type | Description | Example | +|---------------|--------------|---------| +| Narwhalified (FrameT) | DataFrames wrapped by Narwhals for backend-agnostic operations. These are validated first to ensure consistent handling across backends. | @nw.narwhalify decorated functions create these during operations. | +| Native DataFrames | Core DataFrame implementations from supported backends. These are validated directly against TEMPORALSCOPE_CORE_BACKEND_TYPES. | pd.DataFrame, pl.DataFrame, pa.Table | +| DataFrame Subclasses | Custom or specialized DataFrames that inherit from native types. Common in: - Custom DataFrame implementations - Backend optimizations (e.g. lazy evaluation) - Backend compatibility layers | TimeSeriesDataFrame (pd.DataFrame), dask.dataframe.DataFrame (inherits from pandas) | +| Intermediate States | DataFrames in the middle of narwhalify operations or backend conversions. These may be temporary subclasses used for optimization or compatibility. | LazyDataFrame during backend conversion operation chaining | Naming Conventions: ----------------- +------------------- The following naming conventions are used for utility functions in this module: -+--------------------------+------------------------------------------------------------+ -| Pattern | Purpose | -+--------------------------+------------------------------------------------------------+ -| `validate_` | Checks that an object meets specific requirements and | -| | raises an error if it doesn't. | -+--------------------------+------------------------------------------------------------+ -| `is_` | Returns metadata about an object (Boolean or other). | -+--------------------------+------------------------------------------------------------+ -| `convert_` | Transforms an object to a desired type or structure. | -+--------------------------+------------------------------------------------------------+ -| `check_` | Performs a specific check and returns a result or raises | -| | an error if the condition is violated. | -+--------------------------+------------------------------------------------------------+ -| `sort_` | Orders an object based on specified criteria. | -+--------------------------+------------------------------------------------------------+ - -- Narwhals functions always return the native format of the backend (e.g., Pandas, - Polars, Modin) after execution. This behavior ensures backend-agnostic compatibility. 
-- Use `nw.from_native` when chaining `@nw.narwhalify` functions to maintain compatibility - with backend-agnostic workflows, as each function outputs a native DataFrame format. -- The `@nw.narwhalify` decorator automatically manages backend detection and format - conversion at the entry and exit points of the function. + +| Pattern | Purpose | +|---------|---------| +| `validate_` | Checks that an object meets specific requirements and raises an error if it doesn't. | +| `is_` | Returns metadata about an object (Boolean or other). | +| `convert_` | Transforms an object to a desired type or structure. | +| `check_` | Performs a specific check and returns a result or raises an error if the condition is violated. | +| `sort_` | Orders an object based on specified criteria. | + """ -# Standard Library Imports import os import warnings from importlib import util @@ -183,21 +134,14 @@ import dask.dataframe as dd import modin.pandas as mpd - -# Narwhals Imports import narwhals as nw - -# Third-Party Imports import pandas as pd import polars as pl import pyarrow as pa - -# Loading Environment variables from dotenv import load_dotenv from narwhals.typing import FrameT, IntoDataFrame from narwhals.utils import Implementation -# TemporalScope Imports from temporalscope.core.exceptions import TimeColumnError, UnsupportedBackendError # Load environment variables from the .env file @@ -260,21 +204,27 @@ def get_api_keys() -> Dict[str, Optional[str]]: """Retrieve API keys from environment variables. - :return: A dictionary containing the API keys, or None if not found. - :rtype: Dict[str, Optional[str]] - - Example Usage: - -------------- - .. code-block:: python - - # Assume environment variables are set: - # export OPENAI_API_KEY='abc123' - # export CLAUDE_API_KEY='def456' - - # Retrieve API keys - api_keys = get_api_keys() - print(api_keys) - # Output: {'OPENAI_API_KEY': 'abc123', 'CLAUDE_API_KEY': 'def456'} + Returns + ------- + Dict[str, Optional[str]] + + Examples + -------- + ```python + # Assume environment variables are set: + # export OPENAI_API_KEY='abc123' + # export CLAUDE_API_KEY='def456' + + # Retrieve API keys + api_keys = get_api_keys() + print(api_keys) + # Output: {'OPENAI_API_KEY': 'abc123', 'CLAUDE_API_KEY': 'def456'} + ``` + + Returns + ------- + Dict[str, Optional[str]] + A dictionary containing the API keys, or None if not found. """ api_keys = { @@ -290,17 +240,22 @@ def get_api_keys() -> Dict[str, Optional[str]]: def print_divider(char: str = "=", length: int = 70) -> None: """Prints a divider line made of a specified character and length. - :param char: The character to use for the divider, defaults to '=' - :type char: str, optional - :param length: The length of the divider, defaults to 70 - :type length: int, optional + Parameters + ---------- + char : str, optional + The character to use for the divider, defaults to '=' + length : int, optional - Example: - ------- - .. code-block:: python + Examples + -------- + ```python + print_divider(char="-", length=50) + # Output: -------------------------------------------------- + ``` - print_divider(char="-", length=50) - # Output: -------------------------------------------------- + Returns + ------- + None """ print(char * length) @@ -314,16 +269,22 @@ def print_divider(char: str = "=", length: int = 70) -> None: def get_narwhals_backends() -> List[str]: """Retrieve all backends available through Narwhals. - :return: List of Narwhals-supported backend names in lowercase. 
-    :rtype: List[str]
+    Returns
+    -------
+    List[str]
+        List of Narwhals-supported backend names.

-    Example Usage:
-    --------------
-    .. code-block:: python
+    Examples
+    --------
+    ```python
+    backends = get_narwhals_backends()
+    print(backends)
+    # Output: ['pandas', 'modin', 'pyarrow', 'polars', 'dask']
+    ```

-        backends = get_narwhals_backends()
-        print(backends)
-        # Output: ['pandas', 'modin', 'pyarrow', 'polars', 'dask']

     """
     return [backend.name.lower() for backend in Implementation]
@@ -332,17 +293,23 @@ def get_narwhals_backends() -> List[str]:
 def get_default_backend_cfg() -> Dict[str, List[str]]:
     """Retrieve the default application configuration for available backends.

-    :return: Dictionary with a single key 'BACKENDS' containing a list of all
-        Narwhals-supported backends.
-    :rtype: Dict[str, List[str]]
-
-    Example:
+    Returns
     -------
-    .. code-block:: python
+    Dict[str, List[str]]
+        Dictionary with a single key 'BACKENDS' containing a list of all
+        Narwhals-supported backends.

-        config = get_default_backend_cfg()
-        print(config)
-        # Output: {'BACKENDS': ['pandas', 'modin', 'pyarrow', 'polars', 'dask']}
+    Examples
+    --------
+    ```python
+    config = get_default_backend_cfg()
+    print(config)
+    # Output: {'BACKENDS': ['pandas', 'modin', 'pyarrow', 'polars', 'dask']}
+    ```

     """
     available_backends = get_narwhals_backends()
@@ -352,16 +319,22 @@ def get_default_backend_cfg() -> Dict[str, List[str]]:
 def get_temporalscope_backends() -> List[str]:
     """Retrieve the subset of Narwhals-supported backends compatible with TemporalScope.

-    :return: List of backend names supported by TemporalScope.
-    :rtype: List[str]
+    Returns
+    -------
+    List[str]
+        List of backend names supported by TemporalScope.

-    Example Usage:
-    --------------
-    .. code-block:: python
+    Examples
+    --------
+    ```python
+    backends = get_temporalscope_backends()
+    print(backends)
+    # Output: ['pandas', 'modin', 'pyarrow', 'polars', 'dask']
+    ```

-        backends = get_temporalscope_backends()
-        print(backends)
-        # Output: ['pandas', 'modin', 'pyarrow', 'polars', 'dask']

     """
     available_backends = get_narwhals_backends()
@@ -371,24 +344,37 @@ def get_temporalscope_backends() -> List[str]:
 def is_valid_temporal_backend(backend_name: str) -> None:
     """Validate that a backend is supported by TemporalScope and Narwhals.

-    :param backend_name: Name of the backend to validate.
-    :type backend_name: str
-    :raises UnsupportedBackendError: If the backend is not in supported or optional backends.
-    :raises UserWarning: If the backend is in the optional set but not installed.
-
-    Example Usage:
-    --------------
-    .. code-block:: python
+    Parameters
+    ----------
+    backend_name : str
+        Name of the backend to validate.

-        # Validate a valid backend
-        is_valid_temporal_backend("pandas")  # Passes silently

-        # Validate an unsupported backend
-        try:
-            is_valid_temporal_backend("unsupported_backend")
-        except UnsupportedBackendError as e:
-            print(e)
-        # Output: Backend 'unsupported_backend' is not supported by TemporalScope.
+    Returns
+    -------
+    None
+
+    Raises
+    ------
+    UnsupportedBackendError
+        If the backend is not in supported or optional backends.
+    UserWarning
+        If the backend is in the optional set but not installed.
+
+    Examples
+    --------
+    ```python
+    # Validate a valid backend
+    is_valid_temporal_backend("pandas")  # Passes silently
+
+    # Validate an unsupported backend
+    try:
+        is_valid_temporal_backend("unsupported_backend")
+    except UnsupportedBackendError as e:
+        print(e)
+    # Output: Backend 'unsupported_backend' is not supported by TemporalScope.
+    ```

     """
     # Assume TEMPORALSCOPE_CORE_BACKENDS and TEMPORALSCOPE_OPTIONAL_BACKENDS are sets
@@ -417,27 +403,37 @@ def is_valid_temporal_dataframe(df: Union[SupportedTemporalDataFrame, Any]) -> T
     Uses TEMPORALSCOPE_CORE_BACKEND_TYPES to validate actual DataFrame instances.
     Handles both native DataFrame types and narwhalified (FrameT) DataFrames.

-    :param df: Object to validate, can be any supported DataFrame type or arbitrary object.
-    :type df: Union[SupportedTemporalDataFrame, Any]
-    :return: Tuple of (is_valid, df_type) where df_type is:
-        - "narwhals" for FrameT DataFrames
-        - "native" for supported DataFrame types
-        - None if not valid
-    :rtype: Tuple[bool, Optional[str]]
+    Parameters
+    ----------
+    df : Union[SupportedTemporalDataFrame, Any]
+        Object to validate, can be any supported DataFrame type or arbitrary object.

-    Example:
+    Returns
     -------
-    .. code-block:: python
-
-        df = pd.DataFrame({"time": [1, 2, 3], "value": [10, 20, 30]})
-        result = is_valid_temporal_dataframe(df)
-        print(result)
-        # Output: (True, "native")
-
-        invalid_df = "Not a DataFrame"
-        result = is_valid_temporal_dataframe(invalid_df)
-        print(result)
-        # Output: (False, None)
+    Tuple[bool, Optional[str]]
+        Tuple of (is_valid, df_type) where df_type is:
+        - "narwhals" for FrameT DataFrames
+        - "native" for supported DataFrame types
+        - None if not valid
+
+    Examples
+    --------
+    ```python
+    df = pd.DataFrame({"time": [1, 2, 3], "value": [10, 20, 30]})
+    result = is_valid_temporal_dataframe(df)
+    print(result)
+    # Output: (True, "native")
+
+    invalid_df = "Not a DataFrame"
+    result = is_valid_temporal_dataframe(invalid_df)
+    print(result)
+    # Output: (False, None)
+    ```
     """

     try:
@@ -470,30 +466,39 @@ def is_valid_temporal_dataframe(df: Union[SupportedTemporalDataFrame, Any]) -> T
 def get_dataframe_backend(df: Union[SupportedTemporalDataFrame, Any]) -> str:
     """Get the backend name for a DataFrame.

-    :param df: DataFrame to get backend for.
-    :type df: Union[SupportedTemporalDataFrame, Any]
-    :return: Backend name ('pandas', 'modin', 'polars', 'pyarrow', 'dask').
-    :rtype: str
-    :raises UnsupportedBackendError: If DataFrame type not supported.
+    Parameters
+    ----------
+    df : Union[SupportedTemporalDataFrame, Any]
+        DataFrame to get backend for.
+
+    Returns
+    -------
+    str
+        Backend name ('pandas', 'modin', 'polars', 'pyarrow', 'dask').

-    Example Usage:
-    --------------
-    .. code-block:: python
+    Raises
+    ------
+    UnsupportedBackendError
+        If DataFrame type not supported.
- from temporalscope.core.core_utils import get_dataframe_backend - import pandas as pd + Examples + -------- + ```python + from temporalscope.core.core_utils import get_dataframe_backend + import pandas as pd - # Example with a Pandas DataFrame - df = pd.DataFrame({"col1": [1, 2, 3]}) - backend = get_dataframe_backend(df) - print(backend) # Output: 'pandas' + # Example with a Pandas DataFrame + df = pd.DataFrame({"col1": [1, 2, 3]}) + backend = get_dataframe_backend(df) + print(backend) # Output: 'pandas' - # Example with a Polars DataFrame - import polars as pl + # Example with a Polars DataFrame + import polars as pl - df = pl.DataFrame({"col1": [1, 2, 3]}) - backend = get_dataframe_backend(df) - print(backend) # Output: 'polars' + df = pl.DataFrame({"col1": [1, 2, 3]}) + backend = get_dataframe_backend(df) + print(backend) # Output: 'polars' + ``` """ # First validate DataFrame type @@ -520,31 +525,42 @@ def get_dataframe_backend(df: Union[SupportedTemporalDataFrame, Any]) -> str: def is_lazy_evaluation(df: SupportedTemporalDataFrame) -> bool: """Check if a DataFrame uses lazy evaluation. - :param df: The DataFrame to check for evaluation mode. - :type df: SupportedTemporalDataFrame - :return: True if the DataFrame uses lazy evaluation, False otherwise. - :rtype: bool - :raises UnsupportedBackendError: If the DataFrame's backend is not supported by TemporalScope. - - Example Usage: - -------------- - .. code-block:: python - - from temporalscope.core.core_utils import is_lazy_evaluation - - # Check evaluation mode - if is_lazy_evaluation(df): - # Lazy evaluation path - result = df.select([...]) # Maintain lazy evaluation - else: - # Eager evaluation path - result = df.select([...]) # Direct computation is safe - - .. note:: - This function determines whether a DataFrame uses lazy or eager evaluation: - - Lazy execution, such as with Dask or Polars in lazy mode. - - Eager execution, such as with Pandas or Polars in eager mode. - This distinction is important for maintaining consistent evaluation modes during computations. + Parameters + ---------- + df : SupportedTemporalDataFrame + The DataFrame to check for evaluation mode. + + Returns + ------- + bool + True if the DataFrame uses lazy evaluation, False otherwise. + + Raises + ------ + UnsupportedBackendError + If the DataFrame's backend is not supported by TemporalScope. + + Examples + -------- + ```python + from temporalscope.core.core_utils import is_lazy_evaluation + + # Check evaluation mode + if is_lazy_evaluation(df): + # Lazy evaluation path + result = df.select([...]) # Maintain lazy evaluation + else: + # Eager evaluation path + result = df.select([...]) # Direct computation is safe + ``` + + Notes + ----- + This function determines whether a DataFrame uses lazy or eager evaluation: + - Lazy execution, such as with Dask or Polars in lazy mode. + - Eager execution, such as with Pandas or Polars in eager mode. + - This distinction is important for maintaining consistent evaluation modes during computations. + """ # Validate the input DataFrame is_valid, _ = is_valid_temporal_dataframe(df) @@ -568,35 +584,52 @@ def convert_to_backend( Narwhals handles validation and lazy evaluation, while `backend_converter_dict` manages conversion. - :param df: Input DataFrame (pandas, modin, polars, pyarrow, dask). - :type df: Union[SupportedTemporalDataFrame, IntoDataFrame] - :param backend: Target backend ('pandas', 'modin', 'polars', 'pyarrow', 'dask'). - :type backend: str - :param npartitions: Number of partitions for Dask backend. 
Default is 1.
-    :type npartitions: int
-    :param backend_converter_dict: Backend conversion functions.
-    :type backend_converter_dict: Dict[str, Callable[[pd.DataFrame, int], Any]], optional
-    :return: Converted DataFrame in the target backend.
-    :rtype: SupportedTemporalDataFrame
-    :raises UnsupportedBackendError: If the backend or DataFrame is unsupported.
-
-    Example Usage:
-    --------------
-    .. code-block:: python
-
-        # Pandas -> Polars conversion
-        df_pd = pd.DataFrame({"time": range(10), "value": range(10)})
-        df_polars = convert_to_backend(df_pd, backend="polars")
-
-        # Dask -> Pandas materialization
-        df_dask = dd.from_pandas(df_pd, npartitions=2)
-        df_pd_result = convert_to_backend(df_dask, backend="pandas")
-
-    .. note::
-        Steps:
-        - Validate: Narwhals checks input compatibility (`is_valid_temporal_dataframe`).
-        - Materialize: Handles lazy evaluation (Dask/Polars LazyFrames).
-        - Convert: Uses `backend_converter_dict` for backend-specific transformations.
+    Parameters
+    ----------
+    df : Union[SupportedTemporalDataFrame, IntoDataFrame]
+        Input DataFrame (pandas, modin, polars, pyarrow, dask).
+    backend : str
+        Target backend ('pandas', 'modin', 'polars', 'pyarrow', 'dask').
+    npartitions : int
+        Number of partitions for Dask backend. Default is 1.
+    backend_converter_dict : Dict[str, Callable[[pd.DataFrame, int], Any]], optional
+        Backend conversion functions. Default is TEMPORALSCOPE_BACKEND_CONVERTERS.
+
+    Returns
+    -------
+    SupportedTemporalDataFrame
+        Converted DataFrame in the target backend.
+
+    Raises
+    ------
+    UnsupportedBackendError
+        If the backend or DataFrame is unsupported.
+
+    Examples
+    --------
+    ```python
+    # Pandas -> Polars conversion
+    df_pd = pd.DataFrame({"time": range(10), "value": range(10)})
+    df_polars = convert_to_backend(df_pd, backend="polars")
+
+    # Dask -> Pandas materialization
+    df_dask = dd.from_pandas(df_pd, npartitions=2)
+    df_pd_result = convert_to_backend(df_dask, backend="pandas")
+    ```
+
+    Notes
+    -----
+    Steps:
+    - Validate: Narwhals checks input compatibility (`is_valid_temporal_dataframe`).
+    - Materialize: Handles lazy evaluation (Dask/Polars LazyFrames).
+    - Convert: Uses `backend_converter_dict` for backend-specific transformations.
+
     """
     # Validate backend and DataFrame using helper functions
     is_valid_temporal_backend(backend)
@@ -648,34 +681,47 @@ def check_dataframe_empty(df: SupportedTemporalDataFrame) -> bool:
     determines whether it is empty based on standard backend attributes such as
     `shape`. It handles lazy evaluation transparently for backends like Dask and Polars.

-    :param df: The input DataFrame to check.
-    :type df: SupportedTemporalDataFrame
-    :return: True if the DataFrame is empty, False otherwise.
-    :rtype: bool
-    :raises ValueError: If the input DataFrame is None or invalid.
-    :raises UnsupportedBackendError: If the backend is not supported by TemporalScope.
+    Parameters
+    ----------
+    df : SupportedTemporalDataFrame
+        The input DataFrame to check.

-    Example Usage:
-    --------------
-    ..
code-block:: python - from temporalscope.core.core_utils import check_dataframe_empty - - # Example with Pandas DataFrame - import pandas as pd - - df = pd.DataFrame(columns=["col1"]) - assert check_dataframe_empty(df) == True - - # Example with lazy-evaluation backends - # Assumes `df` is a lazy DataFrame (e.g., Dask or Polars) - assert check_dataframe_empty(df) == True - - .. note:: - This function checks for emptiness using attributes like `shape`, `__len__`, - and `num_rows` to support various backends. These attributes cover common - DataFrame implementations, ensuring robust handling across the Narwhals API. - If none of these attributes are present, an `UnsupportedBackendError` is raised. + Returns + ------- + bool + True if the DataFrame is empty, False otherwise. + + Raises + ------ + ValueError + If the input DataFrame is None or invalid. + UnsupportedBackendError + If the backend is not supported by TemporalScope. + + Examples + -------- + ```python + from temporalscope.core.core_utils import check_dataframe_empty + + # Example with Pandas DataFrame + import pandas as pd + + df = pd.DataFrame(columns=["col1"]) + assert check_dataframe_empty(df) == True + + # Example with lazy-evaluation backends + # Assumes `df` is a lazy DataFrame (e.g., Dask or Polars) + assert check_dataframe_empty(df) == True + ``` + + Notes + ----- + This function checks for emptiness using attributes like `shape`, `__len__`, + and `num_rows` to support various backends. These attributes cover common + DataFrame implementations, ensuring robust handling across the Narwhals API. + If none of these attributes are present, an `UnsupportedBackendError` is raised. """ if df is None: @@ -711,44 +757,59 @@ def check_dataframe_nulls_nans(df: SupportedTemporalDataFrame, column_names: Lis This function first validates if the DataFrame is empty using `check_dataframe_empty` and then performs backend-agnostic null value counting for the specified columns. - :param df: DataFrame to check for null values. - :type df: SupportedTemporalDataFrame - :param column_names: List of column names to check. - :type column_names: List[str] - :return: Dictionary mapping column names to their null value counts. - :rtype: Dict[str, int] - :raises ValueError: If the DataFrame is empty or a column is nonexistent. - :raises UnsupportedBackendError: If the backend is unsupported. - - Example Usage: - -------------- - .. code-block:: python - - from temporalscope.core.core_utils import check_dataframe_nulls_nans - - # Example input DataFrame - import pandas as pd - - df = pd.DataFrame( - { - "col1": [1, 2, None], - "col2": [4, None, 6], - } - ) - - # Define columns to check - column_names = ["col1", "col2"] + Parameters + ---------- + df : SupportedTemporalDataFrame + DataFrame to check for null values. + column_names : List[str] + List of column names to check. + df: SupportedTemporalDataFrame : - # Call check_dataframe_nulls_nans - null_counts = check_dataframe_nulls_nans(df, column_names) + column_names: List[str] : - # Output: {"col1": 1, "col2": 1} - print(null_counts) - .. note:: - This function handles lazy evaluation defensively (e.g., using `compute` or `collect`) - to ensure compatibility with backends like Dask and Polars. It works with eager - backends such as Pandas or Polars eager mode as well, maintaining backend agnosticism. + Returns + ------- + Dict[str, int] + Dictionary mapping column names to their null value counts. + + Raises + ------ + ValueError + If the DataFrame is empty or a column is nonexistent. 
+ UnsupportedBackendError + If the backend is unsupported. + + Examples + -------- + ```python + from temporalscope.core.core_utils import check_dataframe_nulls_nans + + # Example input DataFrame + import pandas as pd + + df = pd.DataFrame( + { + "col1": [1, 2, None], + "col2": [4, None, 6], + } + ) + + # Define columns to check + column_names = ["col1", "col2"] + + # Call check_dataframe_nulls_nans + null_counts = check_dataframe_nulls_nans(df, column_names) + + # Output: {"col1": 1, "col2": 1} + print(null_counts) + ``` + + Notes + ----- + This function handles lazy evaluation defensively (e.g., using `compute` or `collect`) + to ensure compatibility with backends like Dask and Polars. It works with eager + backends such as Pandas or Polars eager mode as well, maintaining backend agnosticism. """ # Step 0: Validate the input DataFrame @@ -799,32 +860,43 @@ def convert_to_numeric( ) -> SupportedTemporalDataFrame: """Convert a datetime column to numeric using Narwhals API. - :param df: The input DataFrame containing the column to convert. - :type df: SupportedTemporalDataFrame - :param time_col: The name of the time column to convert. - :type time_col: str - :param col_expr: The Narwhals column expression for the time column. - :type col_expr: Any - :param col_dtype: The resolved dtype of the time column. - :type col_dtype: Any - :return: The DataFrame with the converted time column. - :rtype: SupportedTemporalDataFrame - :raises ValueError: If the column is not a datetime type. - :raises UnsupportedBackendError: If the backend is not supported by TemporalScope. - - Example Usage: - -------------- - .. code-block:: python - - df = pd.DataFrame({"time": pd.date_range("2023-01-01", periods=3)}) - df = convert_to_numeric(df, "time", nw.col("time"), df["time"].dtype) - print(df) - - .. note:: - - Converts datetime columns to numeric using `dt.timestamp()`. - - Uses `time_unit="us"` for general backend compatibility. - - Ensures the resulting column is cast to `Float64` for numeric operations. - - Handles potential overflow issues for PyArrow by selecting smaller time units. + Parameters + ---------- + df : SupportedTemporalDataFrame + The input DataFrame containing the column to convert. + time_col : str + The name of the time column to convert. + col_expr : Any + The Narwhals column expression for the time column. + col_dtype : Any + The resolved dtype of the time column. + + Returns + ------- + SupportedTemporalDataFrame + The DataFrame with the converted time column. + + Raises + ------ + ValueError + If the column is not a datetime type. + UnsupportedBackendError + If the backend is not supported by TemporalScope. + + Examples + -------- + ```python + df = pd.DataFrame({"time": pd.date_range("2023-01-01", periods=3)}) + df = convert_to_numeric(df, "time", nw.col("time"), df["time"].dtype) + print(df) + ``` + + Notes + ----- + - Converts datetime columns to numeric using `dt.timestamp()`. + - Uses `time_unit="us"` for general backend compatibility. + - Ensures the resulting column is cast to `Float64` for numeric operations. + - Handles potential overflow issues for PyArrow by selecting smaller time units. """ # Validate the DataFrame @@ -850,39 +922,51 @@ def convert_datetime_column_to_numeric( microseconds ("us") by default. It ensures compatibility across all Narwhals-supported backends. - :param df: Input DataFrame containing the datetime column. - :type df: SupportedTemporalDataFrame - :param time_col: Name of the column to convert. Must be of datetime type. 
- :type time_col: str - :param time_unit: Time unit for conversion ("us", "ms", "ns"). Default is "us". - The choice of "us" provides optimal compatibility across - Pandas, Polars, and PyArrow backends. - :type time_unit: Literal["us", "ms", "ns"] - :return: DataFrame with the converted time column. - :rtype: SupportedTemporalDataFrame - :raises UnsupportedBackendError: If the DataFrame's backend is not supported. - :raises ValueError: If the specified column is not a datetime type or does not exist. - - Example Usage: - -------------- - .. code-block:: python - - from temporalscope.core.core_utils import convert_datetime_column_to_numeric - import pandas as pd - - # Create example DataFrame with a datetime column - df = pd.DataFrame({"time": pd.date_range(start="2023-01-01", periods=3, freq="D"), "value": [10, 20, 30]}) - - # Convert 'time' column to numeric (microseconds precision) - df = convert_datetime_column_to_numeric(df, time_col="time", time_unit="us") - print(df) - - .. note:: - - Supports microseconds ("us"), milliseconds ("ms"), and nanoseconds ("ns"). - - Preserves timezone-aware datetimes during conversion. - - Handles null values consistently across all supported backends. - - Does not enforce monotonicity or data sorting. Use a sorting utility if required. - - When using `"ns"` precision, values are cast to `Int64` to avoid overflow issues with large timestamps. + Parameters + ---------- + df : SupportedTemporalDataFrame + Input DataFrame containing the datetime column. + time_col : str + Name of the column to convert. Must be of datetime type. + time_unit : Literal["us", "ms", "ns"] + Time unit for conversion ("us", "ms", "ns"). Default is "us". + The choice of "us" provides optimal compatibility across + Pandas, Polars, and PyArrow backends. + + Returns + ------- + SupportedTemporalDataFrame + DataFrame with the converted time column. + + Raises + ------ + UnsupportedBackendError + If the DataFrame's backend is not supported. + ValueError + If the specified column is not a datetime type or does not exist. + + Examples + -------- + ```python + from temporalscope.core.core_utils import convert_datetime_column_to_numeric + import pandas as pd + + # Create example DataFrame with a datetime column + df = pd.DataFrame({"time": pd.date_range(start="2023-01-01", periods=3, freq="D"), "value": [10, 20, 30]}) + + # Convert 'time' column to numeric (microseconds precision) + df = convert_datetime_column_to_numeric(df, time_col="time", time_unit="us") + print(df) + ``` + + Notes + ----- + - Supports microseconds ("us"), milliseconds ("ms"), and nanoseconds ("ns"). + - Preserves timezone-aware datetimes during conversion. + - Handles null values consistently across all supported backends. + - Does not enforce monotonicity or data sorting. Use a sorting utility if required. + - When using `"ns"` precision, values are cast to `Int64` to avoid overflow issues with large timestamps. + """ # Step 1: Validate the DataFrame is_valid, _ = is_valid_temporal_dataframe(df) @@ -923,33 +1007,44 @@ def convert_time_column_to_datetime( ) -> SupportedTemporalDataFrame: """Convert a string or numeric column to datetime using Narwhals API. - :param df: The input DataFrame containing the column to convert. - :type df: SupportedTemporalDataFrame - :param time_col: The name of the time column to convert. - :type time_col: str - :param col_expr: The Narwhals column expression for the time column. - :type col_expr: Any - :param col_dtype: The resolved dtype of the time column. 
- :type col_dtype: Any - :return: The DataFrame with the converted time column. - :rtype: SupportedTemporalDataFrame - :raises ValueError: If the column is not convertible to datetime. - :raises UnsupportedBackendError: If the backend is not supported by TemporalScope. - - Example Usage: - -------------- - .. code-block:: python - - df = pd.DataFrame({"time": [1672531200000, 1672617600000]}) # Unix timestamps - df = convert_time_column_to_datetime(df, "time", nw.col("time"), df["time"].dtype) - print(df) - - .. note:: - - Handles string columns using `str.to_datetime()` for backend compatibility. - - Numeric columns are cast directly to `Datetime` using `cast(nw.Datetime())` where supported. - - For PyArrow, handles timezone preservation and default `time_unit="ns"`. - - Narwhals-backend ensures consistent behavior across lazy and eager backends. - - Raises errors for unsupported column types to prevent silent failures. + Parameters + ---------- + df : SupportedTemporalDataFrame + The input DataFrame containing the column to convert. + time_col : str + The name of the time column to convert. + col_expr : Any + The Narwhals column expression for the time column. + col_dtype : Any + The resolved dtype of the time column. + + Returns + ------- + SupportedTemporalDataFrame + The DataFrame with the converted time column. + + Raises + ------ + ValueError + If the column is not convertible to datetime. + UnsupportedBackendError + If the backend is not supported by TemporalScope. + + Examples + -------- + ```python + df = pd.DataFrame({"time": [1672531200000, 1672617600000]}) # Unix timestamps + df = convert_time_column_to_datetime(df, "time", nw.col("time"), df["time"].dtype) + print(df) + ``` + + Notes + ----- + - Handles string columns using `str.to_datetime()` for backend compatibility. + - Numeric columns are cast directly to `Datetime` using `cast(nw.Datetime())` where supported. + - For PyArrow, handles timezone preservation and default `time_unit="ns"`. + - Narwhals-backend ensures consistent behavior across lazy and eager backends. + - Raises errors for unsupported column types to prevent silent failures. """ # Validate the DataFrame @@ -968,32 +1063,43 @@ def convert_time_column_to_datetime( def validate_time_column_type(time_col: str, col_dtype: Any) -> None: """Validate that a column is either numeric or datetime. - :param time_col: The name of the time column to validate. - :type time_col: str - :param col_dtype: The resolved dtype of the time column. - :type col_dtype: Any - :raises ValueError: If the column is neither numeric nor datetime. + Parameters + ---------- + time_col : str + The name of the time column to validate. + col_dtype : Any + The resolved dtype of the time column. - Example Usage: - -------------- - .. code-block:: python + Returns + ------- + None - validate_time_column_type("time", pd.Series([1, 2, 3]).dtype) # Passes - validate_time_column_type("time", pd.Series(["2023-01-01"]).dtype) # Passes + Raises + ------ + ValueError + If the column is neither numeric nor datetime. - try: - validate_time_column_type("time", pd.Series(["abc"]).dtype) # Raises ValueError - except ValueError as e: - print(e) - # Output: Column 'time' is neither numeric nor datetime. - - .. note:: - - Validates column dtypes to ensure they are either numeric (float/int) or datetime. - - For numeric columns, supports all backend-specific numeric types (e.g., Float64, Int64). - - For datetime columns, supports both timezone-aware and naive formats (e.g., UTC, local). 
- - Provides clear error messages for unsupported types, ensuring better debugging in enterprise pipelines. - - Centralized validation logic avoids repeated dtype checks in other utility functions. - - Compatible with Narwhals lazy evaluation backends like Dask or Modin. + Examples + -------- + ```python + validate_time_column_type("time", pd.Series([1, 2, 3]).dtype) # Passes + validate_time_column_type("time", pd.Series(["2023-01-01"]).dtype) # Passes + + try: + validate_time_column_type("time", pd.Series(["abc"]).dtype) # Raises ValueError + except ValueError as e: + print(e) + # Output: Column 'time' is neither numeric nor datetime. + ``` + + Notes + ----- + - Validates column dtypes to ensure they are either numeric (float/int) or datetime. + - For numeric columns, supports all backend-specific numeric types (e.g., Float64, Int64). + - For datetime columns, supports both timezone-aware and naive formats (e.g., UTC, local). + - Provides clear error messages for unsupported types, ensuring better debugging in enterprise pipelines. + - Centralized validation logic avoids repeated dtype checks in other utility functions. + - Compatible with Narwhals lazy evaluation backends like Dask or Modin. """ is_numeric = "float" in str(col_dtype).lower() or "int" in str(col_dtype).lower() @@ -1010,32 +1116,44 @@ def validate_and_convert_time_column( ) -> SupportedTemporalDataFrame: """Validate and optionally convert the time column in a DataFrame. - :param df: The input DataFrame to process. - :type df: SupportedTemporalDataFrame - :param time_col: The name of the time column to validate or convert. - :type time_col: str - :param conversion_type: Optional. Specify the conversion type: - - 'numeric': Convert to Float64. - - 'datetime': Convert to Datetime. - - None: Validate only. - :type conversion_type: Optional[str] - :return: The validated and optionally converted DataFrame. - :rtype: SupportedTemporalDataFrame - :raises TimeColumnError: If validation or conversion fails or if an invalid conversion_type is provided. - :raises ValueError: If the column dtype cannot be resolved. - :raises UnsupportedBackendError: If the backend is not supported by TemporalScope. - - Example Usage: - -------------- - .. code-block:: python - - df = validate_and_convert_time_column(df, "time", conversion_type="numeric") - - .. note:: - - Validates and converts the `time_col` to the specified type (`numeric` or `datetime`). - - Uses backend-specific adjustments for PyArrow and other frameworks. - - Handles nulls and ensures consistent schema across all supported backends. - - Raises errors for invalid `conversion_type` values. + Parameters + ---------- + df : SupportedTemporalDataFrame + The input DataFrame to process. + time_col : str + The name of the time column to validate or convert. + conversion_type : Optional[str] + Optional. Specify the conversion type: + - 'numeric': Convert to Float64. + - 'datetime': Convert to Datetime. + - None: Validate only. + + Returns + ------- + SupportedTemporalDataFrame + The validated and optionally converted DataFrame. + + Raises + ------ + TimeColumnError + If validation or conversion fails or if an invalid conversion_type is provided. + ValueError + If the column dtype cannot be resolved. + UnsupportedBackendError + If the backend is not supported by TemporalScope. 
+ + Examples + -------- + ```python + df = validate_and_convert_time_column(df, "time", conversion_type="numeric") + ``` + + Notes + ----- + - Validates and converts the `time_col` to the specified type (`numeric` or `datetime`). + - Uses backend-specific adjustments for PyArrow and other frameworks. + - Handles nulls and ensures consistent schema across all supported backends. + - Raises errors for invalid `conversion_type` values. """ # Validate the DataFrame @@ -1076,25 +1194,38 @@ def validate_dataframe_column_types(df: SupportedTemporalDataFrame, time_col: st - The specified `time_col` must be of type numeric or datetime. - All other columns in the DataFrame must be of numeric type. - :param df: The input DataFrame to validate. - :type df: SupportedTemporalDataFrame - :param time_col: The name of the time column to validate. - :type time_col: str - :raises ValueError: If the `time_col` does not exist or has an invalid type. - :raises ValueError: If any non-time column has an invalid type. - :raises UnsupportedBackendError: If the backend is not supported by TemporalScope. + Parameters + ---------- + df : SupportedTemporalDataFrame + The input DataFrame to validate. + time_col : str + The name of the time column to validate. + + Returns + ------- + None - Example Usage: - -------------- - .. code-block:: python + Raises + ------ + ValueError + If the `time_col` does not exist or has an invalid type. + ValueError + If any non-time column has an invalid type. + UnsupportedBackendError + If the backend is not supported by TemporalScope. - from temporalscope.core.temporal_data_loader import validate_dataframe_column_types + Examples + -------- + ```python + from temporalscope.core.temporal_data_loader import validate_dataframe_column_types - # Example DataFrame - df = pd.DataFrame({"time": pd.date_range("2023-01-01", periods=5), "value": [1.0, 2.0, 3.0, 4.0, 5.0]}) + # Example DataFrame + df = pd.DataFrame({"time": pd.date_range("2023-01-01", periods=5), "value": [1.0, 2.0, 3.0, 4.0, 5.0]}) + + # Validate column types + validate_dataframe_column_types(df, time_col="time") + ``` - # Validate column types - validate_dataframe_column_types(df, time_col="time") """ # Validate the DataFrame is_valid, _ = is_valid_temporal_dataframe(df) @@ -1134,45 +1265,59 @@ def sort_dataframe_time( ) -> SupportedTemporalDataFrame: """Sort a DataFrame by the specified time column using Narwhals' backend-agnostic operations. - :param df: The input DataFrame to sort. - :type df: SupportedTemporalDataFrame - :param time_col: The name of the column to sort by. Must exist in the DataFrame. - :type time_col: str - :param ascending: Sort direction. Defaults to True (ascending order). - :type ascending: bool, optional - :return: A DataFrame sorted by the specified time column. - :rtype: SupportedTemporalDataFrame - :raises ValueError: If the `time_col` does not exist in the DataFrame or has invalid type. - :raises UnsupportedBackendError: If the backend is not supported by TemporalScope. - - Example Usage: - -------------- - .. code-block:: python - - from temporalscope.core.core_utils import sort_dataframe_time - import pandas as pd - - # Example DataFrame - df = pd.DataFrame({"time": [3, 1, 4, 2, 5], "value": range(5)}) - - # Sort DataFrame by the 'time' column in ascending order - sorted_df = sort_dataframe_time(df, time_col="time", ascending=True) - print(sorted_df) - - .. 
note:: - - The `@nw.narwhalify` decorator automatically handles backend detection - and adapts sorting to Pandas, Modin, Dask, Polars, and PyArrow. - - Validates that the time column exists and has a valid type (numeric or datetime). - - Uses string column names for sorting to ensure compatibility across all backends. - - Handles lazy evaluation in backends like Dask and Polars. - - **Dask Specific Note**: - - DaskLazyFrame requires explicit computation using `collect()` or `compute()`. - - This function ensures such materialization happens before sorting. - - .. warning:: - Sorting large DataFrames in lazy backends like Dask or Polars may cause - computations or require additional memory. Ensure memory constraints are handled. + Parameters + ---------- + df : SupportedTemporalDataFrame + The input DataFrame to sort. + time_col : str + The name of the column to sort by. Must exist in the DataFrame. + ascending : bool, optional + Sort direction. Defaults to True (ascending order). + + Returns + ------- + SupportedTemporalDataFrame + A DataFrame sorted by the specified time column. + + Raises + ------ + ValueError + If the `time_col` does not exist in the DataFrame or has invalid type. + UnsupportedBackendError + If the backend is not supported by TemporalScope. + + Examples + -------- + ```python + from temporalscope.core.core_utils import sort_dataframe_time + import pandas as pd + + # Example DataFrame + df = pd.DataFrame({"time": [3, 1, 4, 2, 5], "value": range(5)}) + + # Sort DataFrame by the 'time' column in ascending order + sorted_df = sort_dataframe_time(df, time_col="time", ascending=True) + print(sorted_df) + ``` + + Notes + ----- + - The `@nw.narwhalify` decorator automatically handles backend detection + and adapts sorting to Pandas, Modin, Dask, Polars, and PyArrow. + - Validates that the time column exists and has a valid type (numeric or datetime). + - Uses string column names for sorting to ensure compatibility across all backends. + - Handles lazy evaluation in backends like Dask and Polars. + + **Dask Specific Note**: + + - DaskLazyFrame requires explicit computation using `collect()` or `compute()`. + - This function ensures such materialization happens before sorting. + + Warnings + -------- + Sorting large DataFrames in lazy backends like Dask or Polars may cause + computations or require additional memory. Ensure memory constraints are handled. + """ # Validate the DataFrame is_valid, _ = is_valid_temporal_dataframe(df) @@ -1205,62 +1350,74 @@ def validate_temporal_uniqueness( the groups defined by the `id_col` (e.g., within a patient's records). It does not enforce ordering, allowing for mixed-frequency data and flexible temporal patterns. - :param df: The DataFrame to validate. - :type df: SupportedTemporalDataFrame - :param time_col: The column representing time. - :type time_col: str - :param raise_error: Whether to raise an error if validation fails. - :type raise_error: bool - :param id_col: An optional string to indicate the grouping identifier (e.g., group name). - :type id_col: str - :raises ValueError: If validation fails and `raise_error` is True. - :warns UserWarning: If validation fails and `raise_error` is False. - - Example Usage: - -------------- - .. 
code-block:: python - - import narwhals as nw - import pandas as pd - - # Create insurance claims data with patient visits - df = pd.DataFrame( - { - "patient_id": [1, 1, 1, 2, 2], - "time": [ - "2023-01-01", - "2023-02-15", - "2023-04-01", # Patient 1's visits - "2023-01-01", - "2023-03-15", - ], # Patient 2's visits - "claim_amount": [100.0, 250.0, 150.0, 300.0, 200.0], - } - ) - - # Validate timestamps within each patient's records - for patient in df["patient_id"].unique(): - patient_records = df[df["patient_id"] == patient] - validate_temporal_uniqueness( - patient_records, time_col="time", id_col=f"patient {patient} " - ) # Will pass - each patient has unique visit dates - - # Example with duplicate timestamps - df_invalid = pd.DataFrame( - { - "patient_id": [1, 1, 1], - "time": ["2023-01-01", "2023-01-01", "2023-02-15"], # Duplicate visit date - "claim_amount": [100.0, 150.0, 200.0], - } - ) - - # This will raise ValueError: "Duplicate timestamps in patient 1 column 'time'." - validate_temporal_uniqueness(df_invalid, time_col="time", id_col="patient 1 ") + Parameters + ---------- + df : SupportedTemporalDataFrame + The DataFrame to validate. + time_col : str + The column representing time. + raise_error : bool + Whether to raise an error if validation fails. + id_col : str + An optional string to indicate the grouping identifier (e.g., group name). + + Returns + ------- + None + + Raises + ------ + ValueError + If validation fails and `raise_error` is True. + :warns UserWarning: If validation fails and `raise_error` is False. + + Examples + -------- + ```python + import narwhals as nw + import pandas as pd + + # Create insurance claims data with patient visits + df = pd.DataFrame( + { + "patient_id": [1, 1, 1, 2, 2], + "time": [ + "2023-01-01", + "2023-02-15", + "2023-04-01", # Patient 1's visits + "2023-01-01", + "2023-03-15", + ], # Patient 2's visits + "claim_amount": [100.0, 250.0, 150.0, 300.0, 200.0], + } + ) + + # Validate timestamps within each patient's records + for patient in df["patient_id"].unique(): + patient_records = df[df["patient_id"] == patient] + validate_temporal_uniqueness( + patient_records, time_col="time", id_col=f"patient {patient} " + ) # Will pass - each patient has unique visit dates + + # Example with duplicate timestamps + df_invalid = pd.DataFrame( + { + "patient_id": [1, 1, 1], + "time": ["2023-01-01", "2023-01-01", "2023-02-15"], # Duplicate visit date + "claim_amount": [100.0, 150.0, 200.0], + } + ) + + # This will raise ValueError: "Duplicate timestamps in patient 1 column 'time'." + validate_temporal_uniqueness(df_invalid, time_col="time", id_col="patient 1 ") + ``` + + Notes + ----- + - This function only validates uniqueness within the given `id_col` (e.g., per patient). + - It does not enforce temporal ordering, allowing for mixed-frequency data. + - Different `id_col` groups (e.g., different patients) can have events on the same dates. - .. note:: - - This function only validates uniqueness within the given `id_col` (e.g., per patient). - - It does not enforce temporal ordering, allowing for mixed-frequency data. - - Different `id_col` groups (e.g., different patients) can have events on the same dates. 
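+
+    When `raise_error=False`, the same failure is reported as a `UserWarning` instead of raising.
+    A minimal, illustrative sketch of that path (reusing the `df_invalid` frame from the example above):
+
+    ```python
+    import warnings
+
+    with warnings.catch_warnings(record=True) as caught:
+        warnings.simplefilter("always")
+        validate_temporal_uniqueness(df_invalid, time_col="time", raise_error=False, id_col="patient 1 ")
+    # A UserWarning describing the duplicate timestamps is expected to be captured
+    assert any(issubclass(w.category, UserWarning) for w in caught)
+    ```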
""" # Step 1: Validate time column type and convert if needed try: diff --git a/src/temporalscope/core/exceptions.py b/src/temporalscope/core/exceptions.py index 2bd23b2..36fae7d 100644 --- a/src/temporalscope/core/exceptions.py +++ b/src/temporalscope/core/exceptions.py @@ -15,23 +15,6 @@ # specific language governing permissions and limitations # under the License. -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - """TemporalScope/src/temporalscope/core/exceptions.py This module defines custom exceptions and warnings used throughout the TemporalScope package. @@ -43,6 +26,7 @@ class TimeFrameError(Exception): """Base class for exceptions in the TimeFrame module. This serves as the foundation for all `TimeFrame`-related errors. + """ pass @@ -51,15 +35,12 @@ class TimeFrameError(Exception): class TimeColumnError(TimeFrameError): """Exception raised for validation issues with `time_col`. - :param message: Explanation of the error. - :type message: str - - Example Usage: - -------------- - .. code-block:: python - - if not nw.is_numeric(df[time_col]) and not nw.is_timestamp(df[time_col]): - raise TimeColumnError("`time_col` must be numeric or timestamp-like.") + Examples + -------- + ```python + if not nw.is_numeric(df[time_col]) and not nw.is_timestamp(df[time_col]): + raise TimeColumnError("`time_col` must be numeric or timestamp-like.") + ``` """ pass @@ -71,15 +52,16 @@ class TargetColumnWarning(UserWarning): This warning is issued when the target column appears to contain sequential or vectorized data, which may require transformation depending on the selected mode (e.g., `MODE_MULTI_TARGET`). - Example Usage: - -------------- - .. code-block:: python + Examples + -------- + ```python + if mode == "multi_target" and target_col_is_vectorized: + warnings.warn( + "`target_col` appears to contain sequential data. Ensure it is transformed appropriately for MODE_MULTI_TARGET.", + TargetColumnWarning, + ) + ``` - if mode == "multi_target" and target_col_is_vectorized: - warnings.warn( - "`target_col` appears to contain sequential data. Ensure it is transformed appropriately for MODE_MULTI_TARGET.", - TargetColumnWarning, - ) """ pass @@ -88,26 +70,29 @@ class TargetColumnWarning(UserWarning): class ModeValidationError(TimeFrameError): """Exception raised when an invalid mode is specified. - :param mode: The invalid mode that caused the error. - :type mode: str - :param message: Explanation of the error. - :type message: str - - Example Usage: - -------------- - .. code-block:: python - - if mode not in VALID_MODES: - raise ModeValidationError(mode, f"Invalid mode: {mode}. Must be one of {VALID_MODES}.") + Parameters + ---------- + mode : str + The invalid mode that caused the error. 
+ message : str + + Examples + -------- + ```python + if mode not in VALID_MODES: + raise ModeValidationError(mode, f"Invalid mode: {mode}. Must be one of {VALID_MODES}.") + ``` """ def __init__(self, mode, message="Invalid mode specified"): """Initialize ModeValidationError. - Args: - mode: The invalid mode that caused the error - message: Explanation of the error, defaults to "Invalid mode specified" - + Parameters + ---------- + mode : str + The invalid mode that caused the error. + message : str + The error message to display. """ self.mode = mode self.message = f"{message}: {mode}" @@ -117,27 +102,23 @@ def __init__(self, mode, message="Invalid mode specified"): class UnsupportedBackendError(TimeFrameError): """Exception raised when an unsupported backend is encountered. - :param backend: The invalid backend that caused the error. - :type backend: str - :param message: Explanation of the error. - :type message: str + Parameters + ---------- + backend : str + The invalid backend that caused the error. + message : str - Example Usage: - -------------- - .. code-block:: python + Examples + -------- + ```python + if backend not in TEMPORALSCOPE_CORE_BACKENDS: + raise UnsupportedBackendError(backend) + ``` - if backend not in TEMPORALSCOPE_CORE_BACKENDS: - raise UnsupportedBackendError(backend) """ def __init__(self, backend, message="Unsupported backend"): - """Initialize UnsupportedBackendError. - - Args: - backend: The invalid backend that caused the error. - message: Explanation of the error, defaults to "Unsupported backend". - - """ + """Initialize UnsupportedBackendError.""" self.backend = backend self.message = f"{message}: {backend}." super().__init__(self.message) diff --git a/src/temporalscope/core/temporal_data_loader.py b/src/temporalscope/core/temporal_data_loader.py index 6253cbc..e3e062a 100644 --- a/src/temporalscope/core/temporal_data_loader.py +++ b/src/temporalscope/core/temporal_data_loader.py @@ -26,25 +26,16 @@ AI Modeling for Time Series Data: --------------------------------- +--------------------------------- TemporalScope is designed with several key assumptions to ensure performance, scalability, and flexibility across a wide range of time series forecasting and XAI workflows: -+------------------------+-----------------------------------------------+ -| Approach | Description | -+------------------------+-----------------------------------------------+ -| Implicit & Static Time | The `time_col` is treated as a feature, | -| Series | enabling ML/DL workflows with mixed-frequency | -| | datasets. By default, `enforce_temporal_uniqueness` | -| | is False. | -+------------------------+-----------------------------------------------+ -| Strict Time Series | Enforces temporal ordering and uniqueness, | -| | suited for forecasting. Group or segment | -| | validation is supported via the `id_col` | -| | parameter. | -+------------------------+-----------------------------------------------+ +| Approach | Description | +|----------|-------------| +| Implicit & Static Time Series | The `time_col` is treated as a feature, enabling ML/DL workflows with mixed-frequency datasets. By default, `enforce_temporal_uniqueness` is False. | +| Strict Time Series | Enforces temporal ordering and uniqueness, suited for forecasting. Group or segment validation is supported via the `id_col` parameter. | 1. Preprocessed Data Requirement: TemporalScope assumes users provide preprocessed data, including categorical @@ -73,63 +64,59 @@ 5. 
Supported Data Modes: -+--------------------+-------------------------------------------+---------------------------+ -| Mode | Description | Compatible Frameworks | -+--------------------+-------------------------------------------+---------------------------+ -| Single-step mode | For scalar target machine learning tasks. | Scikit-learn, XGBoost, | -| | Each row represents a single time step. | LightGBM, SHAP, TensorFlow| -| | | (standard regression) | -+--------------------+-------------------------------------------+---------------------------+ -| Multi-step mode | For sequence forecasting tasks. Input | TensorFlow, PyTorch, | -| | sequences (`X`) and output sequences | Keras, SHAP, LIME | -| | (`Y`) are handled as separate datasets. | | -+--------------------+-------------------------------------------+---------------------------+ +| Mode | Description | Compatible Frameworks | +|------|-------------|------------------------| +| Single-step mode | For scalar target machine learning tasks. Each row represents a single time step. | Scikit-learn, XGBoost, LightGBM, SHAP, TensorFlow (standard regression) | +| Multi-step mode | For sequence forecasting tasks. Input sequences (`X`) and output sequences (`Y`) are handled as separate datasets. | TensorFlow, PyTorch, Keras, SHAP, LIME | By enforcing these constraints, TemporalScope focuses on its core purpose—time series partitioning, explainability, and scalability—while leaving more general preprocessing tasks to the user. This follows industry standards seen in popular time series libraries. -.. seealso:: - - 1. Van Ness, M., Shen, H., Wang, H., Jin, X., Maddix, D.C., & Gopalswamy, K. - (2023). Cross-Frequency Time Series Meta-Forecasting. arXiv preprint - arXiv:2302.02077. - - 2. Woo, G., Liu, C., Kumar, A., Xiong, C., Savarese, S., & Sahoo, D. (2024). - Unified training of universal time series forecasting transformers. arXiv - preprint arXiv:2402.02592. - - 3. Trirat, P., Shin, Y., Kang, J., Nam, Y., Bae, M., Kim, J., Kim, B., & - Lee, J.-G. (2024). Universal time-series representation learning: A survey. - arXiv preprint arXiv:2401.03717. - - 4. Xu, Q., Zhuo, X., Jiang, C., & Liu, Y. (2019). An artificial neural network - for mixed frequency data. Expert Systems with Applications, 118, pp.127-139. - - 5. Filho, L.L., de Oliveira Werneck, R., Castro, M., Ribeiro Mendes Júnior, P., - Lustosa, A., Zampieri, M., Linares, O., Moura, R., Morais, E., Amaral, M., & - Salavati, S. (2024). A multi-modal approach for mixed-frequency time series - forecasting. Neural Computing and Applications, pp.1-25. - -.. note:: - - Multi-Step Mode Limitation: Currently unsupported due to limitations in - backends like Modin and Polars, which lack native support for vectorized - (sequence-based) targets in a single cell. Future updates will include - compatibility with formats like TensorFlow's `tf.data.Dataset` or flattened - target sequences. - - Single-Step Mode Support: Narwhals-backed operations ensure that single-step - mode is fully supported across Pandas, Modin, and Polars without additional - adjustments, handling scalar target workflows seamlessly. - - Recommendation: Use Pandas for multi-step workflows as it best supports - the necessary data structures. Future releases will extend compatibility - for vectorized targets across all backends. - -.. 
seealso:: - - - Narwhals documentation: https://narwhals.readthedocs.io/ - - SHAP documentation: https://shap.readthedocs.io/ - - Boruta-SHAP documentation: https://github.com/Ekeany/Boruta-Shap - - LIME documentation: https://lime-ml.readthedocs.io/ +References +---------- +- Filho, L.L., de Oliveira Werneck, R., Castro, M., Ribeiro Mendes Júnior, P., +Lustosa, A., Zampieri, M., Linares, O., Moura, R., Morais, E., Amaral, M., & +Salavati, S. (2024). A multi-modal approach for mixed-frequency time series +forecasting. Neural Computing and Applications, pp.1-25. + +- Trirat, P., Shin, Y., Kang, J., Nam, Y., Bae, M., Kim, J., Kim, B., & +Lee, J.-G. (2024). Universal time-series representation learning: A survey. +arXiv preprint arXiv:2401.03717. + +- Van Ness, M., Shen, H., Wang, H., Jin, X., Maddix, D.C., & Gopalswamy, K. +(2023). Cross-Frequency Time Series Meta-Forecasting. arXiv preprint +arXiv:2302.02077. + +- Woo, G., Liu, C., Kumar, A., Xiong, C., Savarese, S., & Sahoo, D. (2024). +Unified training of universal time series forecasting transformers. arXiv +preprint arXiv:2402.02592. + +- Xu, Q., Zhuo, X., Jiang, C., & Liu, Y. (2019). An artificial neural network +for mixed frequency data. Expert Systems with Applications, 118, pp.127-139. + + + +Notes +----- +- Multi-Step Mode Limitation: Currently unsupported due to limitations in + backends like Modin and Polars, which lack native support for vectorized + (sequence-based) targets in a single cell. Future updates will include + compatibility with formats like TensorFlow's `tf.data.Dataset` or flattened + target sequences. +- Single-Step Mode Support: Narwhals-backed operations ensure that single-step + mode is fully supported across Pandas, Modin, and Polars without additional + adjustments, handling scalar target workflows seamlessly. +- Recommendation: Use Pandas for multi-step workflows as it best supports + the necessary data structures. Future releases will extend compatibility + for vectorized targets across all backends. + +See Also +-------- +- Narwhals documentation: https://narwhals.readthedocs.io/ +- SHAP documentation: https://shap.readthedocs.io/ +- Boruta-SHAP documentation: https://github.com/Ekeany/Boruta-Shap +- LIME documentation: https://lime-ml.readthedocs.io/ """ from typing import Any, Dict, Optional @@ -165,8 +152,8 @@ class TimeFrame: compatibility with temporal XAI techniques (SHAP, Boruta-SHAP, LIME etc) supporting larger data workflows in production. - Engineering Design Assumptions - ------------------ + Engineering Design Assumptions: + ------------------------------- - Universal Models: This class is designed assuming the user has pre-processed their data for compatibility with deep learning models. Across the TemporalScope utilities (e.g., target shifter, padding, partitioning algorithms), it is assumed that preprocessing tasks, such as categorical feature encoding, will be managed by the user or @@ -179,20 +166,21 @@ class TimeFrame: - All non-time columns are expected to be numeric. Users are responsible for handling non-numeric features (e.g., encoding categorical features). - Backend Handling - ---------------- + Backend Handling: + ----------------- - If a `dataframe_backend` is explicitly provided, it takes precedence over backend inference. - If no backend is specified, the class infers the backend from the DataFrame type, supporting Polars, Pandas, and Modin. - Example Usage - ------------- - .. 
code-block:: python + Examples + -------- + ```python + import polars as pl - import polars as pl + data = pl.DataFrame({"time": pl.date_range(start="2021-01-01", periods=100, interval="1d"), "value": range(100)}) + tf = TimeFrame(data, time_col="time", target_col="value") + print(tf.get_data().head()) + ``` - data = pl.DataFrame({"time": pl.date_range(start="2021-01-01", periods=100, interval="1d"), "value": range(100)}) - tf = TimeFrame(data, time_col="time", target_col="value") - print(tf.get_data().head()) """ def __init__( @@ -209,7 +197,8 @@ def __init__( id_col: Optional[str] = None, verbose: bool = False, ) -> None: - """Initialize a TimeFrame object with required validations, backend handling, and optional time column conversion. + """ + Initialize a TimeFrame object with required validations, backend handling, and optional time column conversion. This constructor initializes the `TimeFrame` object, validates the input DataFrame, and performs optional sorting based on the specified `time_col`. It also allows for @@ -219,83 +208,95 @@ def __init__( There are two common use cases for `TimeFrame`: - 1. Implicit & Static Time Series: - For workflows where `time_col` is treated as a feature, such as in static - modeling for ML/DL applications, `enforce_temporal_uniqueness` can remain - `False` (default). This mode emphasizes a universal design, accommodating - mixed-frequency data. - - 2. Strict Time Series: - For workflows requiring strict temporal ordering and uniqueness (e.g., - forecasting), set `enforce_temporal_uniqueness=True`. Additionally, - specify `id_col` for grouped or segmented validation. - - :param df: The input DataFrame, which can be any TemporalScope-supported backend - (e.g., Pandas, Modin, Polars). - :type df: SupportedTemporalDataFrame - :param time_col: The name of the column representing time. Must be numeric or - timestamp-like for sorting. - :type time_col: str - :param target_col: The name of the column representing the target variable. - :type target_col: str - :param dataframe_backend: The backend to use. If not provided, it will be inferred - based on the DataFrame type. - :type dataframe_backend: Optional[str] - :param sort: If True, the data will be sorted by `time_col`. Default is True. - :type sort: bool - :param ascending: If sorting, whether to sort in ascending order. Default is True. - :type ascending: bool - :param mode: The operation mode, either `MODE_SINGLE_TARGET` (default) or `MODE_MULTI_TARGET`. - :type mode: str - :param time_col_conversion: Optional. Specify the conversion type for the `time_col`: - - 'numeric': Convert to Float64. - - 'datetime': Convert to Datetime. - - None: Validate only. - Default is None. - :type time_col_conversion: Optional[str] - :param enforce_temporal_uniqueness: If True, ensures that timestamps in `time_col` are unique within - each group (defined by `id_col`) or globally if `id_col` is None. - This setting is essential for workflows requiring temporal - consistency, such as forecasting or explainability analysis. - Default is False. - :type enforce_temporal_uniqueness: bool - :param id_col: Optional column for grouped or segmented strict temporal validation. - Default is None. - :type id_col: Optional[str] - :param verbose: If True, enables logging for validation and setup stages. - :type verbose: bool - :raises ModeValidationError: If the specified mode is invalid. - :raises UnsupportedBackendError: If the specified or inferred backend is not supported. 
- :raises ValueError: If required columns are missing, invalid, or if the time column - conversion fails. - - :ivar _metadata: A private metadata dictionary to allow end-users flexibility in extending the TimeFrame object. - This provides storage for any additional attributes or information during runtime. - :type _metadata: Dict[str, Any] - - Example Usage: - -------------- - .. code-block:: python - - import pandas as pd - from temporalscope.core.temporal_data_loader import TimeFrame, MODE_SINGLE_TARGET - - # Example DataFrame - df = pd.DataFrame({"time": pd.date_range(start="2023-01-01", periods=10, freq="D"), "value": range(10)}) - - # Initialize TimeFrame with automatic time column conversion to numeric - tf = TimeFrame(df, time_col="time", target_col="value", time_col_convert_numeric=True, mode=MODE_SINGLE_TARGET) - print(tf.df.head()) - - .. note:: - - The `mode` parameter must be one of: - - `"single_target"`: For scalar target predictions (e.g., regression). - - `"multi_target"`: For sequence forecasting tasks (e.g., seq2seq models). - - The `time_col_conversion` parameter allows for automatic conversion of the `time_col` to either numeric - or datetime during initialization. - - The `_metadata` container follows design patterns similar to SB3, enabling users to manage custom attributes - and extend functionality for advanced workflows, such as future conversion to TensorFlow or PyTorch types - in multi-target explainable AI workflows. + 1. Implicit & Static Time Series: + For workflows where `time_col` is treated as a feature, such as in static + modeling for ML/DL applications, `enforce_temporal_uniqueness` can remain + `False` (default). This mode emphasizes a universal design, accommodating + mixed-frequency data. + + 2. Strict Time Series: + For workflows requiring strict temporal ordering and uniqueness (e.g., + forecasting), set `enforce_temporal_uniqueness=True`. Additionally, + specify `id_col` for grouped or segmented validation. + + Parameters + ---------- + df : SupportedTemporalDataFrame + The input DataFrame, which can be any TemporalScope-supported backend + (e.g., Pandas, Modin, Polars). + time_col : str + The name of the column representing time. Must be numeric or + timestamp-like for sorting. + target_col : str + The name of the column representing the target variable. + time_col_conversion : Optional[str], optional + Specify the conversion type for the `time_col`: + - 'numeric': Convert to Float64. + - 'datetime': Convert to Datetime. + - None: Validate only. + Default is None. + dataframe_backend : Optional[str], optional + The backend to use. If not provided, it will be inferred + based on the DataFrame type. Default is None. + sort : bool, optional + If True, the data will be sorted by `time_col`. Default is True. + ascending : bool, optional + If sorting, whether to sort in ascending order. Default is True. + mode : str, optional + The operation mode, either `MODE_SINGLE_TARGET` (default) or `MODE_MULTI_TARGET`. + Default is MODE_SINGLE_TARGET. + enforce_temporal_uniqueness : bool, optional + If True, ensures that timestamps in `time_col` are unique within + each group (defined by `id_col`) or globally if `id_col` is None. + This setting is essential for workflows requiring temporal + consistency, such as forecasting or explainability analysis. + Default is False. + id_col : Optional[str], optional + Optional column for grouped or segmented strict temporal validation. + Default is None. 
+        verbose : bool, optional
+            If True, enables logging for validation and setup stages.
+            Default is False.
+
+        Raises
+        ------
+        ModeValidationError
+            If the specified mode is invalid.
+        UnsupportedBackendError
+            If the specified or inferred backend is not supported.
+        ValueError
+            If required columns are missing, invalid, or if the time column
+            conversion fails.
+
+        Attributes
+        ----------
+        _metadata : Dict[str, Any]
+            A private metadata dictionary to allow end-users flexibility in extending
+            the TimeFrame object. This provides storage for any additional attributes
+            or information during runtime.
+
+        Examples
+        --------
+        ```python
+        import pandas as pd
+        from temporalscope.core.temporal_data_loader import TimeFrame, MODE_SINGLE_TARGET
+
+        # Example DataFrame
+        df = pd.DataFrame({"time": pd.date_range(start="2023-01-01", periods=10, freq="D"), "value": range(10)})
+
+        # Initialize TimeFrame with automatic time column conversion to numeric
+        tf = TimeFrame(df, time_col="time", target_col="value", time_col_conversion="numeric", mode=MODE_SINGLE_TARGET)
+        print(tf.df.head())
+        ```
+
+        Notes
+        -----
+        - The `mode` parameter must be one of:
+            - `"single_target"`: For scalar target predictions (e.g., regression).
+            - `"multi_target"`: For sequence forecasting tasks (e.g., seq2seq models).
+        - The `time_col_conversion` parameter allows for automatic conversion of the `time_col` to either numeric
+          or datetime during initialization.
+        - The `_metadata` container follows design patterns similar to SB3, enabling users to manage custom attributes
+          and extend functionality for advanced workflows, such as future conversion to TensorFlow or PyTorch types
+          in multi-target explainable AI workflows.
         """
         # Initialize instance variables
         self._time_col = time_col
@@ -331,8 +332,17 @@ def __init__(
     def _validate_parameters(self) -> None:
         """Validate input parameters for the TimeFrame initialization.

-        :raises TypeError: If any parameter has an invalid type.
-        :raises ValueError: If a parameter value is invalid or unsupported.
+        Returns
+        -------
+        None
+
+        Raises
+        ------
+        TypeError
+            If any parameter has an invalid type.
+        ValueError
+            If a parameter value is invalid or unsupported.
+
         """
         if not isinstance(self._time_col, str):
             raise TypeError(f"`time_col` must be a string. Got {type(self._time_col).__name__}.")
@@ -359,13 +369,27 @@ def _validate_parameters(self) -> None:
     def _initialize_backend(self, df: SupportedTemporalDataFrame, dataframe_backend: Optional[str]) -> str:
         """Determine and validate the backend for the DataFrame.

-        :param df: Input DataFrame to initialize the backend.
-        :type df: SupportedTemporalDataFrame
-        :param dataframe_backend: Backend to use. If None, it is inferred from the DataFrame type.
-        :type dataframe_backend: Optional[str]
-        :return: Initialized backend for the DataFrame.
-        :rtype: str
-        :raises UnsupportedBackendError: If the backend is invalid or unsupported.
+        Parameters
+        ----------
+        df : SupportedTemporalDataFrame
+            Input DataFrame to initialize the backend.
+        dataframe_backend : Optional[str]
+            Backend to use. If None, it is inferred from the DataFrame type.
+
+        Returns
+        -------
+        str
+            Initialized backend for the DataFrame.
+
+        Raises
+        ------
+        UnsupportedBackendError
+            If the backend is invalid or unsupported.
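+
+        Examples
+        --------
+        Backend resolution is normally exercised through the public constructor; a minimal,
+        illustrative sketch (the exact backend identifier string depends on the installed backends):
+
+        ```python
+        import pandas as pd
+        from temporalscope.core.temporal_data_loader import TimeFrame
+
+        df = pd.DataFrame({"time": range(5), "value": range(5)})
+
+        # No backend is passed, so it is inferred from the DataFrame type
+        tf = TimeFrame(df, time_col="time", target_col="value")
+        print(tf.backend)  # e.g. the identifier for the pandas backend
+        ```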
+
         """
         if dataframe_backend:
             is_valid_temporal_backend(dataframe_backend)
@@ -380,27 +404,38 @@ def _initialize_backend(self, df: SupportedTemporalDataFrame, dataframe_backend:
     def sort_dataframe_time(self, df: SupportedTemporalDataFrame, ascending: bool = True) -> SupportedTemporalDataFrame:
         """Sort DataFrame by time column using backend-agnostic Narwhals operations.

-        :param df: DataFrame to sort
-        :type df: SupportedTemporalDataFrame
-        :param ascending: Sort direction, defaults to True
-        :type ascending: bool
-        :return: Sorted DataFrame
-        :rtype: SupportedTemporalDataFrame
-
-        Example Usage:
-        --------------
-        .. code-block:: python
-
-            import polars as pl
-            from temporalscope.core.temporal_data_loader import TimeFrame
-
-            data = pl.DataFrame({"time": [3, 1, 4, 2, 5], "target": range(5)})
-            tf = TimeFrame(data, time_col="time", target_col="target", sort=False)
-            sorted_df = tf.sort_dataframe_time(tf.df, ascending=True)
-            print(sorted_df)  # Shows data sorted by time column
+        Parameters
+        ----------
+        df : SupportedTemporalDataFrame
+            DataFrame to sort.
+        ascending : bool, optional
+            Sort direction. Defaults to True (ascending order).
+
+        Returns
+        -------
+        SupportedTemporalDataFrame
+            Sorted DataFrame.
+
+        Examples
+        --------
+        ```python
+        import polars as pl
+        from temporalscope.core.temporal_data_loader import TimeFrame
+
+        data = pl.DataFrame({"time": [3, 1, 4, 2, 5], "target": range(5)})
+        tf = TimeFrame(data, time_col="time", target_col="target", sort=False)
+        sorted_df = tf.sort_dataframe_time(tf.df, ascending=True)
+        print(sorted_df)  # Shows data sorted by time column
+        ```
+
+        Notes
+        -----
+        Uses the reusable utility function `sort_dataframe_time` for consistency across the codebase.

-        .. note::
-            Uses the reusable utility function `sort_dataframe_time` for consistency across the codebase.
         """
         return sort_dataframe_time(df, time_col=self._time_col, ascending=ascending)

@@ -415,35 +450,50 @@ def sort_dataframe_time(self, df: SupportedTemporalDataFrame, ascending: bool = True) -> SupportedTemporalDataFrame:
         The method is designed to support backend-agnostic operations through Narwhals
         and handles different DataFrame backends such as Pandas, Polars, and Modin.

-        :param df: Input DataFrame to validate.
-        :type df: SupportedTemporalDataFrame
-        :raises ValueError: If any columns contain nulls/NaNs or invalid data types.
-        :raises UnsupportedBackendError: If the backend is not supported.
+        Parameters
+        ----------
+        df : SupportedTemporalDataFrame
+            Input DataFrame to validate.

-        Example Usage:
-        --------------
-        .. code-block:: python

-            import pandas as pd
-            from temporalscope.core.temporal_data_loader import TimeFrame
+        Returns
+        -------
+        None

-            # Sample DataFrame
-            df = pd.DataFrame(
-                {
-                    "time": pd.date_range(start="2023-01-01", periods=5, freq="D"),
-                    "value": range(5),
-                }
-            )
+        Raises
+        ------
+        ValueError
+            If any columns contain nulls/NaNs or invalid data types.
+        UnsupportedBackendError
+            If the backend is not supported.
+ + Examples + -------- + ```python + import pandas as pd + from temporalscope.core.temporal_data_loader import TimeFrame + + # Sample DataFrame + df = pd.DataFrame( + { + "time": pd.date_range(start="2023-01-01", periods=5, freq="D"), + "value": range(5), + } + ) - # Initialize a TimeFrame object - tf = TimeFrame(df, time_col="time", target_col="value") + # Initialize a TimeFrame object + tf = TimeFrame(df, time_col="time", target_col="value") - # Validate the DataFrame - tf.validate_dataframe(df) + # Validate the DataFrame + tf.validate_dataframe(df) + ``` + + Notes + ----- + - This function ensures that `time_col` is valid and optionally convertible. + - All other columns must be numeric and free from null values. - .. note:: - - This function ensures that `time_col` is valid and optionally convertible. - - All other columns must be numeric and free from null values. """ # Step 1: Ensure all columns are free of nulls and NaNs null_counts = check_dataframe_nulls_nans(df, df.columns) @@ -475,75 +525,88 @@ def setup( Steps: ------ - 1. Validate the input DataFrame using the `validate_dataframe` method. - 2. Optionally convert the `time_col` to the specified type (`numeric` or `datetime`). - 3. Perform temporal uniqueness validation within groups if enabled. - 4. Optionally sort the DataFrame by `time_col` in the specified order. - - :param df: Input DataFrame to set up and validate. - :type df: SupportedTemporalDataFrame - :param sort: Whether to sort the DataFrame by `time_col`. Defaults to True. - :type sort: bool - :param ascending: Sort order if sorting is enabled. Defaults to True. - :type ascending: bool - :param time_col_conversion: Optional. Specify the conversion type for the `time_col`: - - 'numeric': Convert to Float64. - - 'datetime': Convert to Datetime. - - None: Validate only. - Default is None. - :type time_col_conversion: Optional[str] - :param enforce_temporal_uniqueness: If True, validates that timestamps in the `time_col` are - unique within the groups defined by the `id_col` parameter - (if specified) or across the entire DataFrame. Default is False. - :type enforce_temporal_uniqueness: bool - :param id_col: An optional column name to define groups for temporal uniqueness validation. If None, - validation is performed across the entire DataFrame. Default is None. - :type id_col: Optional[str] - :return: Validated, converted, and optionally sorted DataFrame. - :rtype: SupportedTemporalDataFrame + 1. Validate the input DataFrame using the `validate_dataframe` method. + 2. Optionally convert the `time_col` to the specified type (`numeric` or `datetime`). + 3. Perform temporal uniqueness validation within groups if enabled. + 4. Optionally sort the DataFrame by `time_col` in the specified order. + + Parameters + ---------- + df : SupportedTemporalDataFrame + Input DataFrame to set up and validate. + sort : bool + Whether to sort the DataFrame by `time_col`. Defaults to True. + ascending : bool + Sort order if sorting is enabled. Defaults to True. + time_col_conversion : Optional[str] + Optional. Specify the conversion type for the `time_col`: + - 'numeric': Convert to Float64. + - 'datetime': Convert to Datetime. + - None: Validate only. + Default is None. + enforce_temporal_uniqueness : bool + If True, validates that timestamps in the `time_col` are + unique within the groups defined by the `id_col` parameter + (if specified) or across the entire DataFrame. Default is False. 
+        id_col : Optional[str]
+            An optional column name to define groups for temporal uniqueness validation. If None,
+            validation is performed across the entire DataFrame. Default is None.
+
+        Returns
+        -------
+        SupportedTemporalDataFrame

         Example usage:
         --------------
-        .. code-block:: python
-
-            import pandas as pd
-            from temporalscope.core.temporal_data_loader import TimeFrame
-
-            df = pd.DataFrame(
-                {
-                    "patient_id": [1, 1, 2, 2],
-                    "time": ["2023-01-01", "2023-01-02", "2023-01-01", "2023-01-03"],
-                    "value": [10, 20, 30, 40],
-                }
-            )
-
-            tf = TimeFrame(
-                df,
-                time_col="time",
-                target_col="value",
-            )
-            sorted_df = tf.setup(df, time_col_conversion="datetime", enforce_temporal_uniqueness=True, id_col="patient_id")
-            print(sorted_df)
-
-        This example is provided under the Apache License, Version 2.0, and is distributed "AS IS" without warranties or
-        conditions of any kind. Users should refer to the license for details.
-
-
-        .. note::
-            - This method is designed to be idempotent, ensuring safe revalidation or reinitialization.
-            - The `time_col_conversion` parameter allows you to convert the `time_col` to a numeric or datetime type.
-            - Sorting is performed only if explicitly enabled via the `sort` parameter.
-            - While this method validates, converts, and sorts the DataFrame, it does not modify the TimeFrame's
-              internal state unless explicitly used within another method (e.g., `update_dataframe`).
-            - The `enforce_temporal_uniqueness` parameter can be set dynamically in this method, allowing
-              validation of temporal uniqueness to be turned on/off as needed.
-            - The `id_col` parameter can also be set dynamically, defining the scope of the temporal uniqueness validation.
-
-        .. seealso::
-            The `id_col` parameter enables validation of temporal uniqueness within each group's records, ensuring no duplicate
-            timestamps exist per group while allowing different groups to have events on the same dates. This is particularly
-            useful for multi-entity time series datasets (e.g., patient data, stock prices). Note: Users must check the Apache License
-            for the complete terms of use. This software is distributed "AS-IS" and may require adjustments for specific use cases.
+        ```python
+        import pandas as pd
+        from temporalscope.core.temporal_data_loader import TimeFrame
+
+        df = pd.DataFrame(
+            {
+                "patient_id": [1, 1, 2, 2],
+                "time": ["2023-01-01", "2023-01-02", "2023-01-01", "2023-01-03"],
+                "value": [10, 20, 30, 40],
+            }
+        )
+
+        tf = TimeFrame(
+            df,
+            time_col="time",
+            target_col="value",
+        )
+        sorted_df = tf.setup(df, time_col_conversion="datetime", enforce_temporal_uniqueness=True, id_col="patient_id")
+        print(sorted_df)
+        ```
+
+        Notes
+        -----
+        - This method is designed to be idempotent, ensuring safe revalidation or reinitialization.
+        - The `time_col_conversion` parameter allows you to convert the `time_col` to a numeric or datetime type.
+        - Sorting is performed only if explicitly enabled via the `sort` parameter.
+        - While this method validates, converts, and sorts the DataFrame, it does not modify the TimeFrame's
+          internal state unless explicitly used within another method (e.g., `update_dataframe`).
+ - The `enforce_temporal_uniqueness` parameter can be set dynamically in this method, allowing + validation of temporal uniqueness to be turned on/off as needed. + - The `id_col` parameter can also be set dynamically, defining the scope of the temporal uniqueness validation. + - The `id_col` parameter enables validation of temporal uniqueness within each group's records, ensuring no duplicate + timestamps exist per group while allowing different groups to have events on the same dates. This is particularly + useful for multi-entity time series datasets (e.g., patient data, stock prices). Note: Users must check the Apache License + for the complete terms of use. This software is distributed "AS-IS" and may require adjustments for specific use cases. + Validated, converted, and optionally sorted DataFrame. + """ # Step 1: Basic validation self.validate_dataframe(df) @@ -577,55 +640,65 @@ def update_dataframe(self, df: SupportedTemporalDataFrame) -> None: workflow and ensure that they handle pre-processing to be compatible with downstream tasks. - :param df: New DataFrame to use - :type df: SupportedTemporalDataFrame + Parameters + ---------- + df : SupportedTemporalDataFrame Example Usage: -------------- - .. code-block:: python - - import polars as pl - from temporalscope.core.temporal_data_loader import TimeFrame - - # Initial TimeFrame setup - data = pl.DataFrame( - { - "time": pl.date_range(start="2021-01-01", periods=5, interval="1d"), - "target": range(5), - "feature": range(5), - } - ) - tf = TimeFrame( - data, - time_col="time", - target_col="target", - ascending=True, # Sort order set at initialization - sort=True, # Sort behavior set at initialization - ) - - # Update with new data - uses parameters from initialization - new_data = pl.DataFrame( - { - "time": pl.date_range(start="2021-01-06", periods=5, interval="1d"), - "target": range(5, 10), - "feature": range(5, 10), - } - ) - tf.update_dataframe(new_data) # Will use time_col="time", ascending=True, sort=True - - .. note:: - This method uses the parameters set during TimeFrame initialization: - - Uses the same time_col and target_col - - Maintains the same sort order (ascending/descending) - - Keeps the same sorting behavior (enabled/disabled) - - If you need to change these parameters, create a new TimeFrame instance - with the desired configuration. + ```python + import polars as pl + from temporalscope.core.temporal_data_loader import TimeFrame + + # Initial TimeFrame setup + data = pl.DataFrame( + { + "time": pl.date_range(start="2021-01-01", periods=5, interval="1d"), + "target": range(5), + "feature": range(5), + } + ) + tf = TimeFrame( + data, + time_col="time", + target_col="target", + ascending=True, # Sort order set at initialization + sort=True, # Sort behavior set at initialization + ) + + # Update with new data - uses parameters from initialization + new_data = pl.DataFrame( + { + "time": pl.date_range(start="2021-01-06", periods=5, interval="1d"), + "target": range(5, 10), + "feature": range(5, 10), + } + ) + tf.update_dataframe(new_data) # Will use time_col="time", ascending=True, sort=True + ``` + + Notes + ----- + This method uses the parameters set during TimeFrame initialization: + - Uses the same time_col and target_col + - Maintains the same sort order (ascending/descending) + - Keeps the same sorting behavior (enabled/disabled) + + If you need to change these parameters, create a new TimeFrame instance + with the desired configuration. 
+ + See Also + -------- + - :class:`temporalscope.target_shifters.single_step.SingleStepTargetShifter` + - :class:`temporalscope.partition.padding.functional` + For handling target transformations and padding operations. + New DataFrame to use + df: SupportedTemporalDataFrame : + + Returns + ------- + None - .. seealso:: - - :class:`temporalscope.target_shifters.single_step.SingleStepTargetShifter` - - :class:`temporalscope.partition.padding.functional` - For handling target transformations and padding operations. """ self._df = self.setup(df, sort=True, ascending=self._ascending) @@ -633,8 +706,11 @@ def update_dataframe(self, df: SupportedTemporalDataFrame) -> None: def df(self) -> SupportedTemporalDataFrame: """Return the DataFrame in its current state. - :return: The DataFrame managed by the TimeFrame instance. - :rtype: SupportedTemporalDataFrame + Returns + ------- + SupportedTemporalDataFrame + The DataFrame managed by the TimeFrame instance. + """ return self._df @@ -642,8 +718,11 @@ def df(self) -> SupportedTemporalDataFrame: def mode(self) -> str: """Return the mode of the TimeFrame instance. - :return: The mode of operation, either `MODE_SINGLE_TARGET` or `MODE_MULTI_TARGET`. - :rtype: str + Returns + ------- + str + The mode of operation, either `MODE_SINGLE_TARGET` or `MODE_MULTI_TARGET`. + """ return self._mode @@ -651,8 +730,11 @@ def mode(self) -> str: def backend(self) -> str: """Return the backend of the TimeFrame instance. - :return: The backend of the DataFrame, either specified or inferred. - :rtype: str + Returns + ------- + str + The backend of the DataFrame, either specified or inferred. + """ return self._backend @@ -660,8 +742,11 @@ def backend(self) -> str: def ascending(self) -> bool: """Return the sort order of the TimeFrame instance. - :return: The sort order, True if ascending, False if descending. - :rtype: bool + Returns + ------- + bool + The sort order, True if ascending, False if descending. + """ return self._ascending @@ -675,28 +760,30 @@ def metadata(self) -> Dict[str, Any]: extensions, including multi-target workflows and integration with deep learning libraries like TensorFlow or PyTorch. - Example Usage: - -------------- - .. code-block:: python - - # Initialize a TimeFrame - tf = TimeFrame(df, time_col="time", target_col="value") - - # Add custom metadata - tf.metadata["description"] = "This dataset is for monthly sales forecasting" - tf.metadata["model_details"] = {"type": "LSTM", "framework": "TensorFlow"} - - # Access metadata - print(tf.metadata["description"]) # Output: "This dataset is for monthly sales forecasting" - - .. note:: - - This metadata container is designed following patterns seen in deep reinforcement - learning (DRL) libraries like Stable-Baselines3, where additional metadata is - stored alongside primary data structures for extensibility. - - In future releases, this will support multi-target workflows, enabling the storage - of processed tensor data for deep learning explainability (e.g., SHAP, LIME). + Examples + -------- + >>> # Initialize a TimeFrame + >>> tf = TimeFrame(df, time_col="time", target_col="value") + >>> ... + >>> # Add custom metadata + >>> tf.metadata["description"] = "This dataset is for monthly sales forecasting" + >>> tf.metadata["model_details"] = {"type": "LSTM", "framework": "TensorFlow"} + >>> ... 
+ >>> # Access metadata + >>> print(tf.metadata["description"]) # Output: "This dataset is for monthly sales forecasting" + + Notes + ----- + This metadata container is designed following patterns seen in deep reinforcement + learning (DRL) libraries like Stable-Baselines3, where additional metadata is + stored alongside primary data structures for extensibility. + - In future releases, this will support multi-target workflows, enabling the storage + of processed tensor data for deep learning explainability (e.g., SHAP, LIME). + + Returns + ------- + Dict[str, Any] + Dictionary for storing metadata related to the TimeFrame. - :return: Dictionary for storing metadata related to the TimeFrame. - :rtype: Dict[str, Any] """ return self._metadata diff --git a/src/temporalscope/datasets/dataset_validator.py b/src/temporalscope/datasets/dataset_validator.py index a0a7bfc..58aec70 100644 --- a/src/temporalscope/datasets/dataset_validator.py +++ b/src/temporalscope/datasets/dataset_validator.py @@ -31,100 +31,71 @@ warranty of any kind. Engineering Design ------------------ +------------------ The validator follows a clear separation between validation configuration and execution, designed to work seamlessly with both TimeFrame and raw DataFrame inputs. -+----------------+-------------------------------------------------------------------+ -| Component | Description | -+----------------+-------------------------------------------------------------------+ -| fit() | Input validation phase that ensures: | -| | - Valid DataFrame type | -| | - Required columns present | -| | - Validation thresholds configured | -+----------------+-------------------------------------------------------------------+ -| transform() | Pure Narwhals validation phase that: | -| | - Uses backend-agnostic operations only | -| | - Performs configured validation checks | -| | - Returns detailed validation results | -+----------------+-------------------------------------------------------------------+ - +| Component | Description | +|-----------|-------------| +| `fit()` | Input validation phase that ensures:
 - Valid DataFrame type<br> - Required columns present<br> - Validation thresholds configured |
+| `transform()` | Pure Narwhals validation phase that:<br> - Uses backend-agnostic operations only<br> - Performs configured validation checks<br>
- Returns detailed validation results | Backend-Specific Patterns ------------------------- +------------------------- The following table outlines key patterns for working with different DataFrame backends through Narwhals operations: -+----------------+-------------------------------------------------------------------+ -| Backend | Implementation Pattern | -+----------------+-------------------------------------------------------------------+ -| LazyFrame | Uses collect() for scalar access, handles lazy evaluation through | -| (Dask/Polars) | proper Narwhals operations, avoids direct indexing. | -+----------------+-------------------------------------------------------------------+ -| PyArrow | Uses nw.Int64 for numeric operations, handles comparisons through | -| | Narwhals, converts types before arithmetic operations. | -+----------------+-------------------------------------------------------------------+ -| All Backends | Uses pure Narwhals operations for validation checks, avoids any | -| | backend-specific code to ensure consistent behavior. | -+----------------+-------------------------------------------------------------------+ +| Backend | Implementation Pattern | +|---------|------------------------| +| LazyFrame (Dask/Polars) | Uses `collect()` for scalar access, handles lazy evaluation through proper Narwhals operations, avoids direct indexing. | +| PyArrow | Uses `nw.Int64` for numeric operations, handles comparisons through Narwhals, converts types before arithmetic operations. | +| All Backends | Uses pure Narwhals operations for validation checks, avoids any backend-specific code to ensure consistent behavior. | Research-Backed Thresholds -------------------------- +-------------------------- The following table summarizes validation thresholds derived from key research: -+------------------------+----------------------+---------------------------+--------------------------------+ -| Validation Check | Default Threshold | Source | Reasoning | -+------------------------+----------------------+---------------------------+--------------------------------+ -| Minimum Samples | ≥ 3,000 | Grinsztajn et al. (2022) | Ensures sufficient data for | -| | | | complex model training | -+------------------------+----------------------+---------------------------+--------------------------------+ -| Maximum Samples | ≤ 50,000 | Shwartz-Ziv et al. (2021) | Defines medium-sized dataset | -| | | | upper bound | -+------------------------+----------------------+---------------------------+--------------------------------+ -| Minimum Features | ≥ 4 | Shwartz-Ziv et al. (2021) | Ensures meaningful complexity | -| | | | for model learning | -+------------------------+----------------------+---------------------------+--------------------------------+ -| Maximum Features | < 500 | Gorishniy et al. (2021) | Avoids high-dimensional data | -| | | | challenges | -+------------------------+----------------------+---------------------------+--------------------------------+ -| Feature/Sample Ratio | d/n < 1/10 | Grinsztajn et al. (2022) | Prevents overfitting risk | -+------------------------+----------------------+---------------------------+--------------------------------+ -| Categorical Cardinality| ≤ 20 unique values | Grinsztajn et al. (2022) | Manages categorical feature | -| | | | complexity | -+------------------------+----------------------+---------------------------+--------------------------------+ -| Numerical Uniqueness | ≥ 10 unique values | Gorishniy et al. 
(2021) | Ensures sufficient feature | -| | | | variability | -+------------------------+----------------------+---------------------------+--------------------------------+ - -Example Usage ------------- -.. code-block:: python - - import pandas as pd - from temporalscope.datasets.dataset_validator import DatasetValidator +| Validation Check | Default Threshold | Source | Reasoning | +|-----------------|-------------------|--------|-----------| +| Minimum Samples | ≥ 3,000 | Grinsztajn et al. (2022) | Ensures sufficient data for complex model training | +| Maximum Samples | ≤ 50,000 | Shwartz-Ziv et al. (2021) | Defines medium-sized dataset upper bound | +| Minimum Features | ≥ 4 | Shwartz-Ziv et al. (2021) | Ensures meaningful complexity for model learning | +| Maximum Features | < 500 | Gorishniy et al. (2021) | Avoids high-dimensional data challenges | +| Feature/Sample Ratio | d/n < 1/10 | Grinsztajn et al. (2022) | Prevents overfitting risk | +| Categorical Cardinality | ≤ 20 unique values | Grinsztajn et al. (2022) | Manages categorical feature complexity | +| Numerical Uniqueness | ≥ 10 unique values | Gorishniy et al. (2021) | Ensures sufficient feature variability | + +Examples +-------- +```python +import pandas as pd +from temporalscope.datasets.dataset_validator import DatasetValidator + +# Create sample data +df = pd.DataFrame({"numeric_feature": range(100), "categorical_feature": ["A", "B"] * 50, "target": range(100)}) + +# Create validator with custom thresholds +validator = DatasetValidator( + min_samples=1000, max_samples=10000, checks_to_run=["sample_size", "feature_count"], enable_warnings=True +) + +# Run validation checks +results = validator.validate(df, target_col="target") + +# Print detailed report +validator.print_report(results) +``` + +Notes +----- +- Uses the scikit-learn-style fit/transform pattern but adapted for TemporalScope: + * fit() validates input DataFrame compatibility + * transform() is @nw.narwhalify'd for backend-agnostic operations +- This pattern is used throughout TemporalScope to ensure: + * Input validation happens in fit() + * All operations use Narwhals' backend-agnostic API in transform() +- Supports customizable thresholds for different domain requirements +- Integrates with data pipelines through scikit-learn compatible API - # Create sample data - df = pd.DataFrame({"numeric_feature": range(100), "categorical_feature": ["A", "B"] * 50, "target": range(100)}) - - # Create validator with custom thresholds - validator = DatasetValidator( - min_samples=1000, max_samples=10000, checks_to_run=["sample_size", "feature_count"], enable_warnings=True - ) - - # Run validation checks - results = validator.validate(df, target_col="target") - - # Print detailed report - validator.print_report(results) - -.. note:: - - Uses the scikit-learn-style fit/transform pattern but adapted for TemporalScope: - * fit() validates input DataFrame compatibility - * transform() is @nw.narwhalify'd for backend-agnostic operations - - This pattern is used throughout TemporalScope to ensure: - * Input validation happens in fit() - * All operations use Narwhals' backend-agnostic API in transform() - - Supports customizable thresholds for different domain requirements - - Integrates with data pipelines through scikit-learn compatible API """ import warnings @@ -146,32 +117,34 @@ class ValidationResult: data pipelines, logging systems, and monitoring dashboards. It includes methods for serialization and log formatting to support automated decision making in pipelines. 
- :param passed: Whether the check passed - :type passed: bool - :param message: Optional message explaining the result - :type message: Optional[str] - :param details: Optional dictionary with detailed results - :type details: Optional[Dict[str, Any]] - :param severity: Log level for the validation result (e.g., 'WARNING', 'ERROR') - :type severity: Optional[str] - - Example: - ------- - .. code-block:: python - - # In an Airflow DAG - def validate_dataframeset(**context): - validator = DatasetValidator() - results = validator.fit_transform(df) - - # Get structured results for logging - for check_name, result in results.items(): - log_entry = result.to_log_entry() - if not result.passed: - context["task_instance"].xcom_push(key=f"validation_failure_{check_name}", value=result.to_dict()) - - # Log to monitoring system - logger.log(level=log_entry["log_level"], msg=f"Validation check '{check_name}' failed", extra=log_entry) + Parameters + ---------- + passed : bool + Whether the check passed + message : Optional[str] + Optional message explaining the result + details : Optional[Dict[str, Any]] + Optional dictionary with detailed results + severity : Optional[str] + + Examples + -------- + ```python + # In an Airflow DAG + def validate_dataframeset(**context): + validator = DatasetValidator() + results = validator.fit_transform(df) + + # Get structured results for logging + for check_name, result in results.items(): + log_entry = result.to_log_entry() + if not result.passed: + context["task_instance"].xcom_push(key=f"validation_failure_{check_name}", value=result.to_dict()) + + # Log to monitoring system + logger.log(level=log_entry["log_level"], msg=f"Validation check '{check_name}' failed", extra=log_entry) + Log level for the validation result (e.g., 'WARNING', 'ERROR') + ``` """ @@ -197,10 +170,16 @@ def to_log_entry(self) -> Dict[str, Any]: def get_failed_checks(cls, results: Dict[str, "ValidationResult"]) -> Dict[str, "ValidationResult"]: """Get all failed validation checks for pipeline decision making. - :param results: Dictionary of validation results - :type results: Dict[str, ValidationResult] - :return: Dictionary of failed checks - :rtype: Dict[str, ValidationResult] + Parameters + ---------- + results : Dict[str, ValidationResult] + Dictionary of validation results + + Returns + ------- + Dict[str, ValidationResult] + Dictionary of failed checks + """ return {name: result for name, result in results.items() if not result.passed} @@ -208,10 +187,16 @@ def get_failed_checks(cls, results: Dict[str, "ValidationResult"]) -> Dict[str, def get_validation_summary(cls, results: Dict[str, "ValidationResult"]) -> Dict[str, Any]: """Get summary statistics for monitoring dashboards. - :param results: Dictionary of validation results - :type results: Dict[str, ValidationResult] - :return: Summary statistics - :rtype: Dict[str, Any] + Parameters + ---------- + results : Dict[str, ValidationResult] + Dictionary of validation results + + Returns + ------- + Dict[str, Any] + Summary statistics + """ return { "total_checks": len(results), @@ -229,92 +214,99 @@ class DatasetValidator: and raw DataFrames. Designed for integration into data pipelines and temporal workflows, it enables automated quality checks and monitoring. - Engineering Design Assumptions - ---------------------------- + Engineering Design Assumptions: + ------------------------------- 1. 
Input Validation: - - Supports all Narwhals-compatible DataFrame types - - Handles both eager and lazy evaluation patterns - - Validates column presence and types + - Supports all Narwhals-compatible DataFrame types + - Handles both eager and lazy evaluation patterns + - Validates column presence and types 2. Validation Checks: - - Each check is independent and configurable - - Uses pure Narwhals operations for backend compatibility - - Returns detailed results with messages and metrics + - Each check is independent and configurable + - Uses pure Narwhals operations for backend compatibility + - Returns detailed results with messages and metrics 3. Backend Compatibility: - - No direct DataFrame indexing or operations - - Handles LazyFrame evaluation properly - - Uses type-safe numeric operations + - No direct DataFrame indexing or operations + - Handles LazyFrame evaluation properly + - Uses type-safe numeric operations - Pipeline Integration Features - --------------------------- + Pipeline Integration Features: + ------------------------------ - Automated quality gates for pipeline decision making - Structured results for monitoring and alerting systems - Support for temporal workflow validation - :param min_samples: Minimum number of samples required, based on Grinsztajn et al. (2022) - :type min_samples: int - :param max_samples: Maximum number of samples allowed, based on Shwartz-Ziv et al. (2021) - :type max_samples: int - :param min_features: Minimum number of features required, based on Shwartz-Ziv et al. (2021) - :type min_features: int - :param max_features: Maximum number of features allowed, based on Gorishniy et al. (2021) - :type max_features: int - :param max_feature_ratio: Maximum feature-to-sample ratio, based on Grinsztajn et al. (2022) - :type max_feature_ratio: float - :param min_unique_values: Minimum unique values for numerical features - :type min_unique_values: int - :param max_categorical_values: Maximum unique values for categorical features - :type max_categorical_values: int - :param class_imbalance_threshold: Maximum ratio between largest and smallest classes - :type class_imbalance_threshold: float - :param checks_to_run: List of validation checks to run. If None, runs all checks. - :type checks_to_run: Optional[List[str]] - :param enable_warnings: Whether to show warning messages for failed checks - :type enable_warnings: bool - :raises ValueError: If invalid checks are specified - - Example with default thresholds: - ---------------------------- - .. code-block:: python + Attributes + ---------- + min_samples : int + Minimum number of samples required, based on Grinsztajn et al. (2022) + max_samples : int + Maximum number of samples allowed, based on Shwartz-Ziv et al. (2021) + min_features : int + Minimum number of features required, based on Shwartz-Ziv et al. (2021) + max_features : int + Maximum number of features allowed, based on Gorishniy et al. (2021) + max_feature_ratio : float + Maximum feature-to-sample ratio, based on Grinsztajn et al. (2022) + min_unique_values : int + Minimum unique values for numerical features + max_categorical_values : int + Maximum unique values for categorical features + class_imbalance_threshold : float + Maximum ratio between largest and smallest classes + checks_to_run : Optional[List[str]] + List of validation checks to run. If None, runs all checks. 
+ enable_warnings : bool + Whether to show warning messages for failed checks + + + Raises + ------ + ValueError + If invalid checks are specified + + Examples + -------- + ```python + import pandas as pd + from temporalscope.datasets import DatasetValidator - import pandas as pd - from temporalscope.datasets import DatasetValidator + # Create sample data + df = pd.DataFrame({"feature1": range(5000), "target": range(5000)}) - # Create sample data - df = pd.DataFrame({"feature1": range(5000), "target": range(5000)}) + # Initialize and run validator + validator = DatasetValidator() + results = validator.fit_transform(df) + print(f"All checks passed: {all(r.passed for r in results.values())}") + ``` + + ```python + # In an Airflow DAG + def validate_dataframeset_task(**context): + validator = DatasetValidator(min_samples=1000, checks_to_run=["sample_size", "feature_count"]) - # Initialize and run validator - validator = DatasetValidator() results = validator.fit_transform(df) - print(f"All checks passed: {all(r.passed for r in results.values())}") - - Example Pipeline Integration: - ------------------------- - .. code-block:: python - - # In an Airflow DAG - def validate_dataframeset_task(**context): - validator = DatasetValidator(min_samples=1000, checks_to_run=["sample_size", "feature_count"]) - - results = validator.fit_transform(df) - failed = ValidationResult.get_failed_checks(results) - - if failed: - # Log failures and push metrics - metrics = ValidationResult.get_validation_summary(results) - monitoring.push_metrics("data_validation", metrics) - - # Fail pipeline if critical checks failed - if any(r.severity == "ERROR" for r in failed.values()): - raise AirflowException("Critical validation checks failed") - - .. note:: - Backend-Specific Patterns: - - Use collect() for scalar access (LazyFrame) - - Use nw.Int64 for numeric operations (PyArrow) - - Let @nw.narwhalify handle conversions - - Supports integration with workflow systems (Airflow, Prefect) + failed = ValidationResult.get_failed_checks(results) + + if failed: + # Log failures and push metrics + metrics = ValidationResult.get_validation_summary(results) + monitoring.push_metrics("data_validation", metrics) + + # Fail pipeline if critical checks failed + if any(r.severity == "ERROR" for r in failed.values()): + raise AirflowException("Critical validation checks failed") + ``` + + Notes + ----- + Backend-Specific Patterns: + - Use collect() for scalar access (LazyFrame) + - Use nw.Int64 for numeric operations (PyArrow) + - Let @nw.narwhalify handle conversions + - Supports integration with workflow systems (Airflow, Prefect) + """ # Available validation checks @@ -343,53 +335,63 @@ def __init__( checks_to_run: Optional[List[str]] = None, enable_warnings: bool = True, ): - """Initialize validator with column configuration and thresholds. - - This validator performs quality checks on single DataFrames, designed for integration - into automated pipelines (e.g., Airflow). It validates data quality using research-backed - thresholds while letting end users handle partitioning and parallelization. - - Engineering Design Assumptions: - 1. Single DataFrame Focus: - - Works on individual DataFrames - - End users handle partitioning/parallelization - - Suitable for pipeline integration - - 2. Basic Validation: - - Ensures time_col and target_col exist - - Validates numeric columns (except time_col) - - Checks for null values - - 3. Research-Backed Thresholds: - - Sample size (Grinsztajn et al. 2022) - - Feature counts (Shwartz-Ziv et al. 
2021) - - Feature ratios (Gorishniy et al. 2021) - - :param time_col: Column representing time values - :type time_col: str - :param target_col: Column representing target variable - :type target_col: str - :param min_samples: Minimum samples required (Grinsztajn et al. 2022) - :type min_samples: int - :param max_samples: Maximum samples allowed (Shwartz-Ziv et al. 2021) - :type max_samples: int - :param min_features: Minimum features required (Shwartz-Ziv et al. 2021) - :type min_features: int - :param max_features: Maximum features allowed (Gorishniy et al. 2021) - :type max_features: int - :param max_feature_ratio: Maximum feature-to-sample ratio (Grinsztajn et al. 2022) - :type max_feature_ratio: float - :param min_unique_values: Minimum unique values for numerical features - :type min_unique_values: int - :param max_categorical_values: Maximum unique values for categorical features - :type max_categorical_values: int - :param class_imbalance_threshold: Maximum ratio between largest and smallest classes - :type class_imbalance_threshold: float - :param checks_to_run: List of validation checks to run - :type checks_to_run: Optional[List[str]] - :param enable_warnings: Whether to show warning messages - :type enable_warnings: bool - :raises ValueError: If invalid checks are specified + """ + Initialize the validator with column configuration and thresholds. + + This validator performs quality checks on single DataFrames, designed for + integration into automated pipelines (e.g., Airflow). It validates data quality + using research-backed thresholds while leaving partitioning and parallelization + to end users. + + Engineering Design Assumptions + ------------------------------- + 1. **Single DataFrame Focus**: + - Operates on individual DataFrames. + - Assumes end-users handle partitioning and parallelization. + - Designed for pipeline integration. + + 2. **Basic Validation**: + - Verifies the existence of `time_col` and `target_col`. + - Validates numeric columns (excluding `time_col`). + - Checks for null values. + + 3. **Research-Backed Thresholds**: + - Sample size thresholds (Grinsztajn et al., 2022). + - Feature counts (Shwartz-Ziv et al., 2021). + - Feature ratios (Gorishniy et al., 2021). + + Parameters + ---------- + time_col : str + Column representing time values. + target_col : str + Column representing the target variable. + min_samples : int + Minimum samples required (Grinsztajn et al., 2022). + max_samples : int + Maximum samples allowed (Shwartz-Ziv et al., 2021). + min_features : int + Minimum features required (Shwartz-Ziv et al., 2021). + max_features : int + Maximum features allowed (Gorishniy et al., 2021). + max_feature_ratio : float + Maximum feature-to-sample ratio (Grinsztajn et al., 2022). + min_unique_values : int + Minimum unique values required for numerical features. + max_categorical_values : int + Maximum unique values allowed for categorical features. + class_imbalance_threshold : float + Maximum ratio between the largest and smallest class sizes. + checks_to_run : Optional[List[str]] + List of validation checks to execute. + enable_warnings : bool + Whether to display warning messages. + + Raises + ------ + ValueError + If invalid checks are specified. + """ self.time_col = time_col self.target_col = target_col @@ -415,11 +417,21 @@ def __init__( def _ensure_narwhals_df(self, df: Union[SupportedTemporalDataFrame, FrameT]) -> FrameT: """Ensure DataFrame is Narwhals-compatible. 
- :param df: DataFrame to validate - :type df: Union[SupportedTemporalDataFrame, FrameT] - :return: Narwhals-compatible DataFrame - :rtype: FrameT - :raises TypeError: If input is not a valid temporal DataFrame + Parameters + ---------- + df : Union[SupportedTemporalDataFrame, FrameT] + DataFrame to validate + + Returns + ------- + FrameT + Narwhals-compatible DataFrame + + Raises + ------ + TypeError + If input is not a valid temporal DataFrame + """ is_valid, _ = is_valid_temporal_dataframe(df) if not is_valid: @@ -437,15 +449,21 @@ def _check_feature_variability(self, df: Union[SupportedTemporalDataFrame, Frame - Checks for null values to ensure data quality 3. Validates against minimum uniqueness threshold - :param df: DataFrame to validate - :type df: Union[SupportedTemporalDataFrame, FrameT] - :return: ValidationResult with: - - passed: Whether all features meet variability requirements - - message: Description of any issues found - - details: Dictionary containing: - * numeric_feature: Whether features are numeric - * {column_name}: Number of unique values for each feature - :rtype: ValidationResult + Parameters + ---------- + df : Union[SupportedTemporalDataFrame, FrameT] + DataFrame to validate + + Returns + ------- + ValidationResult + ValidationResult with: + - passed: Whether all features meet variability requirements + - message: Description of any issues found + - details: Dictionary containing: + - numeric_feature: Whether features are numeric + - column_name: Number of unique values for each feature + """ details: Dict[str, Any] = {"numeric_feature": True} @@ -503,21 +521,34 @@ def _check_class_balance( 1. Counting total samples 2. Adding class count information to details - :param df: DataFrame to validate - :type df: Union[SupportedTemporalDataFrame, FrameT] - :param target_col: Target column name - :type target_col: str - :return: ValidationResult with: - - passed: Always True (basic check) - - details: Dictionary containing: - * class_counts: Basic count information - :rtype: ValidationResult - - .. note:: - Implementation Details: - - Uses count() for backend-agnostic counting - - Handles LazyFrame evaluation through collect() - - Converts PyArrow scalars using as_py() + Parameters + ---------- + df : Union[SupportedTemporalDataFrame, FrameT] + DataFrame to validate + target_col : str + Target column name + df: Union[SupportedTemporalDataFrame : + + FrameT] : + + target_col: str : + + + Returns + ------- + ValidationResult + + Notes + ----- + Implementation Details: + - Uses count() for backend-agnostic counting + - Handles LazyFrame evaluation through collect() + - Converts PyArrow scalars using as_py() + ValidationResult with: + - passed: Always True (basic check) + - details: Dictionary containing: + - class_counts: Basic count information + """ if not target_col: return ValidationResult(True, "No target column specified") @@ -544,20 +575,31 @@ def _execute_check( ) -> Optional[ValidationResult]: """Execute a single validation check. - :param check_name: Name of the check to execute - :type check_name: str - :param df: DataFrame to validate - :type df: Union[SupportedTemporalDataFrame, FrameT] - :param target_col: Column name for target-specific checks - :type target_col: Optional[str] - :return: Result of the validation check if enabled, None otherwise - :rtype: Optional[ValidationResult] - :raises ValueError: If check_name is not a valid check name - - .. 
note:: - - Executes a single validation check based on check_name - - Returns None if check is not enabled - - Handles target-specific checks appropriately + Parameters + ---------- + check_name : str + Name of the check to execute + df : Union[SupportedTemporalDataFrame, FrameT] + DataFrame to validate + target_col : Optional[str] + Column name for target-specific checks + + Returns + ------- + Optional[ValidationResult] + Result of the validation check if enabled, None otherwise + + Raises + ------ + ValueError + If check_name is not a valid check name + + Notes + ----- + - Executes a single validation check based on check_name + - Returns None if check is not enabled + - Handles target-specific checks appropriately + """ if check_name not in self.checks_to_run: return None @@ -586,21 +628,28 @@ def _check_sample_size(self, df: Union[SupportedTemporalDataFrame, FrameT]) -> V 1. Counts total samples using backend-agnostic operations 2. Validates against configured minimum and maximum thresholds - :param df: DataFrame to validate - :type df: Union[SupportedTemporalDataFrame, FrameT] - :return: ValidationResult with: - - passed: Whether sample size is within acceptable range - - message: Description of any issues found - - details: Dictionary containing: - * num_samples: Total number of samples in dataset - :rtype: ValidationResult - - .. note:: - Implementation Details: - - Uses count() for backend-agnostic sample counting - - Handles LazyFrame evaluation through collect() - - Converts PyArrow scalars using as_py() - - Handles empty DataFrames gracefully + Parameters + ---------- + df : Union[SupportedTemporalDataFrame, FrameT] + DataFrame to validate + + Returns + ------- + ValidationResult + + Notes + ----- + Implementation Details: + - Uses count() for backend-agnostic sample counting + - Handles LazyFrame evaluation through collect() + - Converts PyArrow scalars using as_py() + - Handles empty DataFrames gracefully + ValidationResult with: + - passed: Whether sample size is within acceptable range + - message: Description of any issues found + - details: Dictionary containing: + - num_samples: Total number of samples in dataset + """ # Handle empty DataFrame if not df.columns: @@ -646,20 +695,26 @@ def _check_sample_size(self, df: Union[SupportedTemporalDataFrame, FrameT]) -> V @nw.narwhalify def _check_feature_count(self, df: Union[SupportedTemporalDataFrame, FrameT]) -> ValidationResult: - """Check if dataset meets feature count requirements. + """ + Validate if the dataset meets feature count requirements. - This method evaluates feature count through a simple process: - 1. Counts total features excluding time and target columns - 2. Validates against configured minimum and maximum thresholds + This method performs feature count validation using the following steps: + 1. Counts the total number of features, excluding the time and target columns. + 2. Verifies the count against the configured minimum and maximum thresholds. + + Parameters + ---------- + df : Union[SupportedTemporalDataFrame, FrameT] + The DataFrame to validate. 
- :param df: DataFrame to validate - :type df: Union[SupportedTemporalDataFrame, FrameT] - :return: ValidationResult with: - - passed: Whether feature count is within acceptable range - - message: Description of any issues found - - details: Dictionary containing: - * num_features: Total number of features in dataset - :rtype: ValidationResult + Returns + ------- + ValidationResult + An object containing the validation outcome, with the following attributes: + - passed (bool): Indicates whether the feature count is within the acceptable range. + - message (str): Describes any issues identified during validation. + - details (dict): Provides additional context with the following key: + - num_features (int): The total number of features in the dataset. """ df = self._ensure_narwhals_df(df) @@ -704,21 +759,28 @@ def _check_feature_ratio(self, df: Union[SupportedTemporalDataFrame, FrameT]) -> 2. Counts feature columns (excluding time and target) 3. Calculates ratio and validates against threshold - :param df: DataFrame to validate - :type df: Union[SupportedTemporalDataFrame, FrameT] - :return: ValidationResult with: - - passed: Whether ratio is within acceptable range - - message: Description of any issues found - - details: Dictionary containing: - * ratio: Feature-to-sample ratio (num_features/num_samples) - :rtype: ValidationResult - - .. note:: - Implementation Details: - - Uses count() for backend-agnostic sample counting - - Handles LazyFrame evaluation through collect() - - Converts PyArrow scalars using as_py() - - Only counts feature columns in ratio calculation + Parameters + ---------- + df : Union[SupportedTemporalDataFrame, FrameT] + DataFrame to validate + + Returns + ------- + ValidationResult + + Notes + ----- + Implementation Details: + - Uses count() for backend-agnostic sample counting + - Handles LazyFrame evaluation through collect() + - Converts PyArrow scalars using as_py() + - Only counts feature columns in ratio calculation + ValidationResult with: + - passed: Whether ratio is within acceptable range + - message: Description of any issues found + - details: Dictionary containing: + - ratio: Feature-to-sample ratio (num_features/num_samples) + """ # Handle empty DataFrame if not df.columns: @@ -778,10 +840,16 @@ def _get_feature_columns(self, df: Union[SupportedTemporalDataFrame, FrameT]) -> Features are defined as all columns except time_col and target_col. - :param df: DataFrame to get columns from - :type df: Union[SupportedTemporalDataFrame, FrameT] - :return: List of feature column names - :rtype: List[str] + Parameters + ---------- + df : Union[SupportedTemporalDataFrame, FrameT] + DataFrame to get columns from + + Returns + ------- + List[str] + List of feature column names + """ cols = df.columns if hasattr(cols, "collect"): @@ -798,24 +866,45 @@ def fit(self, df: Union[SupportedTemporalDataFrame, FrameT]) -> "DatasetValidato 1. Validates DataFrame type and required columns 2. Ensures numeric columns and checks for null values - :param df: DataFrame to validate - :type df: Union[SupportedTemporalDataFrame, FrameT] - :return: DatasetValidator instance for method chaining - :rtype: DatasetValidator - :raises TypeError: If input is not a valid temporal DataFrame - :raises ValueError: If columns are missing or invalid + Parameters + ---------- + df : Union[SupportedTemporalDataFrame, FrameT] + DataFrame to validate - Example: + Returns ------- - .. 
code-block:: python - validator = DatasetValidator(time_col="time", target_col="target") - validator.fit(df) + DatasetValidator + DatasetValidator instance for method chaining + + Raises + ------ + TypeError + If input is not a valid temporal DataFrame + ValueError + If columns are missing or invalid + + Examples + -------- + ```python + validator = DatasetValidator(time_col="time", target_col="target") + validator.fit(df) + ``` """ @nw.narwhalify def validate_numeric(df: Union[SupportedTemporalDataFrame, FrameT]) -> FrameT: - """Validate that all columns except time are numeric.""" + """Validate that all columns except time are numeric. + + Parameters + ---------- + df: Union[SupportedTemporalDataFrame, FrameT] : + + Returns + ------- + FrameT + + """ for col in df.columns: if col != self.time_col: try: @@ -826,7 +915,18 @@ def validate_numeric(df: Union[SupportedTemporalDataFrame, FrameT]) -> FrameT: @nw.narwhalify def check_nulls(df: Union[SupportedTemporalDataFrame, FrameT], columns: List[str]) -> Dict[str, int]: - """Check for null values in specified columns.""" + """Check for null values in specified columns. + + Parameters + ---------- + df: Union[SupportedTemporalDataFrame, FrameT] : + columns: List[str] + + Returns + ------- + Dict[str, int] + + """ null_counts = {} for col in columns: null_count = df.select([nw.col(col).is_null().sum().cast(nw.Int64).alias("nulls")]) @@ -873,36 +973,44 @@ def transform( ) -> Dict[str, ValidationResult]: """Run configured validation checks on the DataFrame. - :param df: DataFrame to validate - :type df: Union[SupportedTemporalDataFrame, FrameT] - :param target_col: Column name for target-specific checks - :type target_col: Optional[str] - :return: Dictionary of validation results for each check - :rtype: Dict[str, ValidationResult] + Parameters + ---------- + df : Union[SupportedTemporalDataFrame, FrameT] + DataFrame to validate + target_col : Optional[str] + Column name for target-specific checks + + target_col: Optional[str] : + (Default value = None) - Example: + Returns ------- - .. code-block:: python + Dict[str, ValidationResult] - import pandas as pd - from temporalscope.datasets import DatasetValidator + Examples + -------- + ```python + import pandas as pd + from temporalscope.datasets import DatasetValidator - # Create sample data - df = pd.DataFrame({"feature1": range(5000), "target": range(5000)}) + # Create sample data + df = pd.DataFrame({"feature1": range(5000), "target": range(5000)}) - # Initialize and run validator - validator = DatasetValidator() - validator.fit(df) - results = validator.transform(df, target_col="target") + # Initialize and run validator + validator = DatasetValidator() + validator.fit(df) + results = validator.transform(df, target_col="target") - # Check results - for check, result in results.items(): - print(f"{check}: {'Passed' if result.passed else 'Failed'}") + # Check results + for check, result in results.items(): + print(f"{check}: {'Passed' if result.passed else 'Failed'}") + ``` - .. note:: - - Uses pure Narwhals operations - - Handles LazyFrame evaluation - - Returns detailed results + Notes + ----- + - Uses pure Narwhals operations + - Handles LazyFrame evaluation + - Returns detailed results (Dictionary of validation results for each check) """ # Execute validation checks @@ -939,35 +1047,48 @@ def fit_transform( ) -> Dict[str, ValidationResult]: """Fit the validator and run validation checks in one step. 
- :param df: DataFrame to validate - :type df: Union[SupportedTemporalDataFrame, FrameT] - :param target_col: Column name for target-specific checks - :type target_col: Optional[str] - :return: Dictionary of validation results for each check - :rtype: Dict[str, ValidationResult] - :raises TypeError: If input is not convertible to a Narwhals DataFrame + Parameters + ---------- + df : Union[SupportedTemporalDataFrame, FrameT] + DataFrame to validate + target_col : Optional[str] + Column name for target-specific checks - Example: + target_col: Optional[str] : + (Default value = None) + + Returns ------- - .. code-block:: python + Dict[str, ValidationResult] + Dictionary of validation results for each check - import pandas as pd - from temporalscope.datasets import DatasetValidator + Raises + ------ + TypeError + If input is not convertible to a Narwhals DataFrame - # Create sample data - df = pd.DataFrame({"feature1": range(5000), "target": range(5000)}) + Examples + -------- + ```python + import pandas as pd + from temporalscope.datasets import DatasetValidator - # Initialize and run validator - validator = DatasetValidator() - results = validator.fit_transform(df, target_col="target") + # Create sample data + df = pd.DataFrame({"feature1": range(5000), "target": range(5000)}) - # Print report - validator.print_report(results) + # Initialize and run validator + validator = DatasetValidator() + results = validator.fit_transform(df, target_col="target") - .. note:: - - Combines fit() and transform() - - Validates input then runs checks - - Returns detailed results + # Print report + validator.print_report(results) + ``` + + Notes + ----- + - Combines fit() and transform() + - Validates input then runs checks + - Returns detailed results """ return self.fit(df).transform(df, target_col) @@ -979,8 +1100,15 @@ def print_report(self, results: Dict[str, ValidationResult]) -> None: For production use cases, use the structured results directly from the validation methods. - :param results: Dictionary of validation results to report - :type results: Dict[str, ValidationResult] + Parameters + ---------- + results : Dict[str, ValidationResult] + Dictionary of validation results to report + + Returns + ------- + None + """ rows = [] for check_name, result in results.items(): diff --git a/src/temporalscope/datasets/datasets.py b/src/temporalscope/datasets/datasets.py index 38c3d2f..8dfdf33 100644 --- a/src/temporalscope/datasets/datasets.py +++ b/src/temporalscope/datasets/datasets.py @@ -26,18 +26,18 @@ Utility for loading datasets with multi-backend support. This class simplifies dataset loading, enabling compatibility with multiple DataFrame backends (such as Pandas, Modin, Polars) for TemporalScope tutorials and examples. -Example Usage: --------------- -.. 
code-block:: python - - from temporalscope.datasets.datasets import DatasetLoader - - # Initialize with 'macrodata' dataset - dataset_loader = DatasetLoader("macrodata") - - # Load dataset with specified backend - data = dataset_loader.load_data(backend="polars") - print(data.head()) # Example access +Examples +-------- +```python +from temporalscope.datasets.datasets import DatasetLoader + +# Initialize with 'macrodata' dataset +dataset_loader = DatasetLoader("macrodata") + +# Load dataset with specified backend +data = dataset_loader.load_data(backend="polars") +print(data.head()) # Example access +``` """ from typing import Any, Tuple @@ -60,8 +60,11 @@ def _load_macrodata() -> Tuple[pd.DataFrame, str]: """Load and preprocess the macrodata dataset. - :return: Preprocessed DataFrame and default target column 'realgdp'. - :rtype: Tuple[pd.DataFrame, str] + Returns + ------- + Tuple[pd.DataFrame, str] + Preprocessed DataFrame and default target column 'realgdp'. + """ loaded_data = macrodata.load_pandas().data if loaded_data is None: @@ -86,18 +89,22 @@ class DatasetLoader: dataset_name : str Name of the dataset to load, as defined in AVAILABLE_DATASETS. - Methods - ------- - load_data(backend: str = "pandas") -> Any - Load the dataset and convert it to the specified backend format. """ def __init__(self, dataset_name: str = "macrodata") -> None: """Initialize DatasetLoader with a specified dataset. - :param dataset_name: Name of the dataset to load. Must be available in AVAILABLE_DATASETS. - :raises ValueError: If the specified dataset is not available. + Parameters + ---------- + dataset_name : str + Name of the dataset to load. Default is 'macrodata'. + + Raises + ------ + ValueError + if the specified dataset is not available. + """ if dataset_name not in AVAILABLE_DATASETS: raise ValueError( @@ -108,8 +115,11 @@ def __init__(self, dataset_name: str = "macrodata") -> None: def _load_dataset_and_target(self) -> Tuple[pd.DataFrame, str]: """Load the dataset and its target column. - :return: DataFrame and associated target column name. - :rtype: Tuple[pd.DataFrame, str] + Returns + ------- + Tuple[pd.DataFrame, str] + DataFrame and associated target column name. + """ print_divider() print(f"Loading dataset: '{self.dataset_name}'") @@ -123,11 +133,23 @@ def _load_dataset_and_target(self) -> Tuple[pd.DataFrame, str]: def load_data(self, backend: str = "pandas") -> Any: """Load the dataset and convert it to the specified backend format. - :param backend: Backend to convert the dataset to. Default is 'pandas'. - :type backend: str - :return: Dataset in the specified backend format. - :rtype: Backend-specific DataFrame type (e.g., pandas.DataFrame, modin.DataFrame, polars.DataFrame) - :raises ValueError: If the backend is unsupported. + Parameters + ---------- + backend : str + Backend to convert the dataset to. Default is 'pandas'. + backend: str : + (Default value = "pandas") + + Returns + ------- + Backend-specific DataFrame type (e.g., pandas.DataFrame, modin.DataFrame, polars.DataFrame) + Dataset in the specified backend format. + + Raises + ------ + ValueError + If the backend is unsupported. 
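+
+        Examples
+        --------
+        A minimal usage sketch, mirroring the module-level example above (the returned
+        object type depends on the chosen backend):
+
+        ```python
+        loader = DatasetLoader("macrodata")
+        df = loader.load_data(backend="pandas")
+        print(df.head())
+        ```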
+ """ # Validate and load the dataset in pandas format is_valid_temporal_backend(backend) diff --git a/src/temporalscope/datasets/synthetic_data_generator.py b/src/temporalscope/datasets/synthetic_data_generator.py index 8b597a0..66a77c8 100644 --- a/src/temporalscope/datasets/synthetic_data_generator.py +++ b/src/temporalscope/datasets/synthetic_data_generator.py @@ -35,65 +35,69 @@ - Multi-step mode: Produces input-output sequence data for sequence forecasting, where input sequences (`X`) and output sequences (`Y`) are handled as part of a unified dataset but with vectorized targets. -.. note:: - - **Batch size**: This package assumes no default batch size; batch size is typically managed by the data loader (e.g., - TensorFlow `DataLoader`, PyTorch `DataLoader`). The synthetic data generator provides the raw data structure, which is - then partitioned and batched as needed in downstream pipelines (e.g., after target shifting or partitioning). +Notes +----- +- **Batch size**: This package assumes no default batch size; batch size is typically managed by the data loader (e.g., + TensorFlow `DataLoader`, PyTorch `DataLoader`). The synthetic data generator provides the raw data structure, which is + then partitioned and batched as needed in downstream pipelines (e.g., after target shifting or partitioning). - - **TimeFrame and Target Shape**: The TemporalScope framework checks if the target is scalar or vector (sequence). The - generated data in multi-step mode follows a unified structure, with the target represented as a sequence in the same - DataFrame. This ensures compatibility with popular machine learning libraries that are compatible with SHAP, LIME, and - other explainability methods. +- **TimeFrame and Target Shape**: The TemporalScope framework checks if the target is scalar or vector (sequence). The + generated data in multi-step mode follows a unified structure, with the target represented as a sequence in the same + DataFrame. This ensures compatibility with popular machine learning libraries that are compatible with SHAP, LIME, and + other explainability methods. -.. seealso:: - For further details on the single-step and multi-step modes, refer to the core TemporalScope documentation on data handling. + +See Also +-------- +For further details on the single-step and multi-step modes, refer to the core TemporalScope documentation on data handling. Example Visualization: ---------------------- Here is a visual demonstration of the datasets generated for single-step and multi-step modes, including the shape of input (`X`) and target (`Y`) data compatible with most popular ML frameworks like TensorFlow, PyTorch, and SHAP. 
-Single-step mode: - +------------+------------+------------+------------+-----------+ - | time | feature_1 | feature_2 | feature_3 | target | - +============+============+============+============+===========+ - | 2023-01-01 | 0.15 | 0.67 | 0.89 | 0.33 | - +------------+------------+------------+------------+-----------+ - | 2023-01-02 | 0.24 | 0.41 | 0.92 | 0.28 | - +------------+------------+------------+------------+-----------+ - - Shape: - - `X`: (num_samples, num_features) - - `Y`: (num_samples, 1) # Scalar target for each time step - -Multi-step mode (with vectorized targets): - - +------------+------------+------------+------------+-------------+ - | time | feature_1 | feature_2 | feature_3 | target | - +============+============+============+============+=============+ - | 2023-01-01 | 0.15 | 0.67 | 0.89 | [0.3, 0.4] | - +------------+------------+------------+------------+-------------+ - | 2023-01-02 | 0.24 | 0.41 | 0.92 | [0.5, 0.6] | - +------------+------------+------------+------------+-------------+ - - Shape: - - `X`: (num_samples, num_features) - - `Y`: (num_samples, sequence_length) # Vectorized target for each input sequence - -Example Usage: --------------- -.. code-block:: python - - from temporalscope.core.core_utils import MODE_SINGLE_TARGET, MODE_MULTI_TARGET - from temporalscope.datasets.synthetic_data_generator import create_sample_data - - # Generating data for single-step mode - df = create_sample_data(num_samples=100, num_features=3, mode=MODE_SINGLE_TARGET) - print(df.head()) # Shows the generated data with features and a scalar target. - - # Generating data for multi-step mode - df = create_sample_data(num_samples=100, num_features=3, mode=MODE_MULTI_TARGET) - print(df.head()) # Shows the generated input sequence (`X`) and target sequence (`Y`). +**Single-step mode**: + +| Time | Feature 1 | Feature 2 | Feature 3 | Target | +|------------|-----------|-----------|-----------|---------| +| 2023-01-01 | 0.15 | 0.67 | 0.89 | 0.33 | +| 2023-01-02 | 0.24 | 0.41 | 0.92 | 0.28 | + + +Shape: + +- `X`: (num_samples, num_features) +- `Y`: (num_samples, 1) + + +**Multi-step mode (with vectorized targets)**: + +| Time | Feature 1 | Feature 2 | Feature 3 | Target | +|------------|-----------|-----------|-----------|-------------| +| 2023-01-01 | 0.15 | 0.67 | 0.89 | [0.3, 0.4] | +| 2023-01-02 | 0.24 | 0.41 | 0.92 | [0.5, 0.6] | + + +Shape: + +- `X`: (num_samples, num_features) +- `Y`: (num_samples, sequence_length) + +Examples +-------- +```python +from temporalscope.core.core_utils import MODE_SINGLE_TARGET, MODE_MULTI_TARGET +from temporalscope.datasets.synthetic_data_generator import create_sample_data + +# Generating data for single-step mode +df = create_sample_data(num_samples=100, num_features=3, mode=MODE_SINGLE_TARGET) +print(df.head()) # Shows the generated data with features and a scalar target. + +# Generating data for multi-step mode +df = create_sample_data(num_samples=100, num_features=3, mode=MODE_MULTI_TARGET) +print(df.head()) # Shows the generated input sequence (`X`) and target sequence (`Y`). +``` + """ import dask.dataframe as dd @@ -112,14 +116,29 @@ def _apply_nulls_nans_single_row(df: pd.DataFrame, feature_cols: list[str], with This is an internal utility function that operates on pandas DataFrames directly for efficiency and simplicity in null/nan application. The main function handles conversion to other backends. 
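+
+    A small illustrative sketch of calling this helper directly on a pandas DataFrame
+    (column names are hypothetical):
+
+    ```python
+    df = pd.DataFrame({"feature_1": [1.0, 2.0], "feature_2": [3.0, 4.0]})
+    _apply_nulls_nans_single_row(df, feature_cols=["feature_1"], with_nulls=True, with_nans=False)
+    # the first row of 'feature_1' is now None
+    ```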
-    :param df: Pandas DataFrame to modify (modified in-place)
-    :type df: pd.DataFrame
-    :param feature_cols: List of feature column names to apply nulls/nans to
-    :type feature_cols: list[str]
-    :param with_nulls: Whether to apply null values
-    :type with_nulls: bool
-    :param with_nans: Whether to apply NaN values (only if with_nulls is False)
-    :type with_nans: bool
+    Parameters
+    ----------
+    df : pd.DataFrame
+        Pandas DataFrame to modify (modified in-place)
+    feature_cols : list[str]
+        List of feature column names to apply nulls/nans to
+    with_nulls : bool
+        Whether to apply null values
+    with_nans : bool
+        Whether to apply NaN values (only if with_nulls is False)
+
+    Returns
+    -------
+    None
+
     """
     if with_nulls:
         df.iloc[0, df.columns.get_indexer(feature_cols)] = None
@@ -147,20 +166,41 @@ def _apply_nulls_nans_multi_row(
     - Prevents overlap between null and nan rows
     - Uses random selection for realistic data generation

-    :param df: Pandas DataFrame to modify (modified in-place)
-    :type df: pd.DataFrame
-    :param feature_cols: List of feature column names to apply nulls/nans to
-    :type feature_cols: list[str]
-    :param with_nulls: Whether to apply null values
-    :type with_nulls: bool
-    :param with_nans: Whether to apply NaN values
-    :type with_nans: bool
-    :param null_percentage: Percentage of rows to contain null values (0.0 to 1.0)
-    :type null_percentage: float
-    :param nan_percentage: Percentage of rows to contain NaN values (0.0 to 1.0)
-    :type nan_percentage: float
-    :param num_samples: Total number of rows in the DataFrame
-    :type num_samples: int
+    Parameters
+    ----------
+    df : pd.DataFrame
+        Pandas DataFrame to modify (modified in-place)
+    feature_cols : list[str]
+        List of feature column names to apply nulls/nans to
+    with_nulls : bool
+        Whether to apply null values
+    with_nans : bool
+        Whether to apply NaN values
+    null_percentage : float
+        Percentage of rows to contain null values (0.0 to 1.0)
+    nan_percentage : float
+        Percentage of rows to contain NaN values (0.0 to 1.0)
+    num_samples : int
+        Total number of rows in the DataFrame
+
+    Returns
+    -------
+    None
+
     """
     null_indices = []
     if with_nulls:
@@ -196,39 +236,50 @@ def generate_synthetic_time_series(
     drop_time: bool = False,
     random_seed: int = RANDOM_SEED,
 ) -> SupportedTemporalDataFrame:
-    """Generate synthetic time series data with specified backend support and configurations.
-
-    :param backend: The backend to use for the generated data.
-    :type backend: str
-    :param num_samples: Number of samples (rows) to generate in the time series data.
-    :type num_samples: int, optional
-    :param num_features: Number of feature columns to generate in addition to 'time' and 'target' columns.
-    :type num_features: int, optional
-    :param with_nulls: Introduces None values in feature columns if True.
-    :type with_nulls: bool, optional
-    :param with_nans: Introduces NaN values in feature columns if True.
-    :type with_nans: bool, optional
-    :param null_percentage: Percentage of rows to contain null values (0.0 to 1.0). Only used if with_nulls is True.
-                            For datasets with few rows, ensures at least one row is affected if nulls are enabled.
-                            For single-row datasets, nulls take precedence over NaNs if both are enabled.
- :type null_percentage: float, optional - :param nan_percentage: Percentage of rows to contain NaN values (0.0 to 1.0). Only used if with_nans is True. - For datasets with few rows, ensures at least one row is affected if NaNs are enabled. - For single-row datasets, nulls take precedence over NaNs if both are enabled. - :type nan_percentage: float, optional - :param mode: Mode for data generation; currently only supports 'single_target'. - :type mode: str, optional - :param time_col_numeric: If True, 'time' column is numeric instead of datetime. - :type time_col_numeric: bool, optional - :param drop_time: If True, omits the time column from output DataFrame. - :type drop_time: bool, optional - :param random_seed: Seed for random number generation to ensure reproducible results. - :type random_seed: int, optional - - :return: DataFrame or Table in the specified backend containing synthetic data. - :rtype: SupportedTemporalDataFrame - - :raises ValueError: If unsupported backend, mode, or invalid parameters. + """ + Generate synthetic time series data with specified backend support and configurations. + + Parameters + ---------- + backend : str + The backend to use for the generated data. + num_samples : int, optional + Number of samples (rows) to generate in the time series data. Default is 100. + num_features : int, optional + Number of feature columns to generate in addition to 'time' and 'target' columns. Default is 3. + with_nulls : bool, optional + Whether to introduce None values in feature columns. Default is False. + with_nans : bool, optional + Whether to introduce NaN values in feature columns. Default is False. + null_percentage : float, optional + Percentage of rows to contain null values (0.0 to 1.0). Only used if `with_nulls` is True. + - For datasets with few rows, ensures at least one row is affected if nulls are enabled. + - For single-row datasets, nulls take precedence over NaNs if both are enabled. + Default is 0.05 (5%). + nan_percentage : float, optional + Percentage of rows to contain NaN values (0.0 to 1.0). Only used if `with_nans` is True. + - For datasets with few rows, ensures at least one row is affected if NaNs are enabled. + - For single-row datasets, nulls take precedence over NaNs if both are enabled. + Default is 0.05 (5%). + mode : str, optional + Mode for data generation. Currently, only 'single_target' is supported. Default is 'single_target'. + time_col_numeric : bool, optional + If True, the 'time' column is numeric instead of a datetime object. Default is False. + drop_time : bool, optional + If True, the time column is omitted from the output DataFrame. Default is False. + random_seed : int, optional + Seed for random number generation to ensure reproducible results. Default is `RANDOM_SEED`. + + Returns + ------- + SupportedTemporalDataFrame + DataFrame or table in the specified backend containing the generated synthetic data. + + Raises + ------ + ValueError + If an unsupported backend, mode, or invalid parameters are specified. + """ is_valid_temporal_backend(backend) diff --git a/docs/Makefile b/src/temporalscope/modeling/__init__.py similarity index 55% rename from docs/Makefile rename to src/temporalscope/modeling/__init__.py index eda8885..13a8339 100644 --- a/docs/Makefile +++ b/src/temporalscope/modeling/__init__.py @@ -14,24 +14,3 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
- -# Minimal makefile for Sphinx documentation -# - -# You can set these variables from the command line, and also -# from the environment for the first two. -SPHINXOPTS ?= -SPHINXBUILD ?= sphinx-build -SOURCEDIR = . -BUILDDIR = _build - -# Put it first so that "make" without argument is like "make help". -help: - @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) - -.PHONY: help Makefile - -# Catch-all target: route all unknown targets to Sphinx using the new -# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). -%: Makefile - @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/src/temporalscope/partition/base_protocol.py b/src/temporalscope/partition/base_protocol.py index a83bb22..ab03a4b 100644 --- a/src/temporalscope/partition/base_protocol.py +++ b/src/temporalscope/partition/base_protocol.py @@ -25,46 +25,28 @@ enough to accommodate both modes when multi-target support is added. Partitioning for modern XAI Time-Series Pipelines: --------------------------------------------- +-------------------------------------------------- Partitioning is foundational to modern time-series workflows. It ensures computational efficiency, robust validation, and interpretable insights. Key use cases include: -+----------------------------+-----------------------------------------------------------------------------------+ -| Aspect | Details | -+----------------------------+-----------------------------------------------------------------------------------+ -| Temporal Explainability | Facilitates feature importance analyses by segmenting data for localized | -| | SHAP/WindowSHAP metrics. | -+----------------------------+-----------------------------------------------------------------------------------+ -| Robust Evaluation | Respects temporal ordering in train-test splits, critical for time-series | -| | generalization. | -+----------------------------+-----------------------------------------------------------------------------------+ -| Scalability and Efficiency | Supports sliding windows, expanding windows, and fixed partitions with | -| | lazy-loading and backend compatibility for large-scale datasets. | -+----------------------------+-----------------------------------------------------------------------------------+ -| Workflow Flexibility | Supports both single-target and multi-target modes, enabling DataFrame | -| | operations and deep learning pipelines through flexible partitioning methods. | -+----------------------------+-----------------------------------------------------------------------------------+ +| Aspect | Details | +|--------|---------| +| Temporal Explainability | Facilitates feature importance analyses by segmenting data for localized SHAP/WindowSHAP metrics. | +| Robust Evaluation | Respects temporal ordering in train-test splits, critical for time-series generalization. | +| Scalability and Efficiency | Supports sliding windows, expanding windows, and fixed partitions with lazy-loading and backend compatibility for large-scale datasets. | +| Workflow Flexibility | Supports both single-target and multi-target modes, enabling DataFrame operations and deep learning pipelines through flexible partitioning methods. | Core Functionality: ------------------- The protocol defines four mandatory methods, ensuring a strict and consistent lifecycle across all partitioning implementations. 
Each method has a clear purpose and aligns with the goals of efficient partitioning: -+-----------------+-----------------------------------------------------------------------------------+ -| Method | Description | -+-----------------+-----------------------------------------------------------------------------------+ -| setup | Prepares and validates input data, ensuring compatibility with the chosen | -| | workflow (e.g., backend conversions, deduplication, parameter checks). | -+-----------------+-----------------------------------------------------------------------------------+ -| fit | Generates partition indices (row ranges) for datasets, supporting sliding | -| | windows, fixed-length, or expanding partitions. | -+-----------------+-----------------------------------------------------------------------------------+ -| transform | Applies the partition indices to retrieve specific data slices, ensuring | -| | memory-efficient operation using lazy evaluation techniques. | -+-----------------+-----------------------------------------------------------------------------------+ -| fit_transform | Combines `fit` and `transform` for eager workflows, directly producing | -| | partitioned data slices. | -+-----------------+-----------------------------------------------------------------------------------+ +| Method | Description | +|--------|-------------| +| `setup` | Prepares and validates input data, ensuring compatibility with the chosen workflow (e.g., backend conversions, deduplication, parameter checks). | +| `fit` | Generates partition indices (row ranges) for datasets, supporting sliding windows, fixed-length, or expanding partitions. | +| `transform` | Applies the partition indices to retrieve specific data slices, ensuring memory-efficient operation using lazy evaluation techniques. | +| `fit_transform` | Combines `fit` and `transform` for eager workflows, directly producing partitioned data slices. | Workflow Modes: --------------- @@ -84,16 +66,17 @@ The protocol is designed for extensibility, ensuring advanced workflows like multi-modal models, cross-frequency partitioning, or custom padding strategies can be integrated seamlessly. -.. seealso:: +See Also +-------- +1. Nayebi, A., Tipirneni, S., Reddy, C. K., et al. (2024). WindowSHAP: An efficient framework for + explaining time-series classifiers based on Shapley values. Journal of Biomedical Informatics. + DOI:10.1016/j.jbi.2023.104438. +2. Gu, X., See, K. W., Wang, Y., et al. (2021). The sliding window and SHAP theory—an improved system + with a long short-term memory network model for state of charge prediction in electric vehicles. + Energies, 14(12), 3692. DOI:10.3390/en14123692. +3. Van Ness, M., Shen, H., Wang, H., et al. (2023). Cross-Frequency Time Series Meta-Forecasting. + arXiv preprint arXiv:2302.02077. - 1. Nayebi, A., Tipirneni, S., Reddy, C. K., et al. (2024). WindowSHAP: An efficient framework for - explaining time-series classifiers based on Shapley values. Journal of Biomedical Informatics. - DOI:10.1016/j.jbi.2023.104438. - 2. Gu, X., See, K. W., Wang, Y., et al. (2021). The sliding window and SHAP theory—an improved system - with a long short-term memory network model for state of charge prediction in electric vehicles. - Energies, 14(12), 3692. DOI:10.3390/en14123692. - 3. Van Ness, M., Shen, H., Wang, H., et al. (2023). Cross-Frequency Time Series Meta-Forecasting. - arXiv preprint arXiv:2302.02077. """ # Ignore given that this is a protocol and does not require implementation. 
@@ -113,7 +96,10 @@ def setup(self) -> None: # pragma: no cover """Prepare and validate input data for partitioning. This method performs preprocessing and ensures the data is compatible - with the specific workflow. Example tasks include: + with the specific workflow. + + Example tasks include: + - Sorting and deduplication for DataFrame workflows. - Conversion to tensors or datasets for multi-target workflows. - Validation of partitioning parameters (e.g., `num_partitions`, `stride`). @@ -121,12 +107,21 @@ def setup(self) -> None: # pragma: no cover This step ensures consistency across partitioning methods and minimizes runtime errors in subsequent stages. - .. note:: - This method should be idempotent and isolated. While optional for - end-users, implementations must ensure it is executed internally - before partitioning begins. + Notes + ----- + This method should be idempotent and isolated. While optional for + end-users, implementations must ensure it is executed internally + before partitioning begins. + + Returns + ------- + None + + Raises + ------ + ValueError + If any required input or parameter is invalid. - :raises ValueError: If any required input or parameter is invalid. """ pass @@ -137,11 +132,15 @@ def fit(self) -> Iterator[Dict[str, Any]]: # pragma: no cover such as `num_partitions`, `window_size`, and `stride`. It utilizes a lazy generator pattern to ensure memory efficiency, especially for large datasets. - :return: Generator yielding partition indices structured as dictionaries. - :rtype: Iterator[Dict[str, Any]] + Returns + ------- + Iterator[Dict[str, Any]] + + Notes + ----- + This method does not perform slicing; it only computes and returns indices. + Generator yielding partition indices structured as dictionaries. - .. note:: - This method does not perform slicing; it only computes and returns indices. """ pass @@ -152,10 +151,16 @@ def transform(self) -> Iterator[Dict[str, Any]]: memory efficiency through lazy evaluation and supports various output formats depending on the workflow mode (e.g., DataFrame slices, tensors, or datasets). - :return: Generator yielding dictionaries containing partitioned data slices. - :rtype: Iterator[Dict[str, Any]] + Returns + ------- + Iterator[Dict[str, Any]] + Generator yielding dictionaries containing partitioned data slices. + + Raises + ------ + ValueError + If `fit` has not been called prior to `transform`. - :raises ValueError: If `fit` has not been called prior to `transform`. """ pass @@ -166,7 +171,10 @@ def fit_transform(self) -> Iterator[Dict[str, Any]]: # pragma: no cover single step. It is ideal for workflows requiring immediate access to partitioned data without intermediate steps. - :return: Generator yielding dictionaries containing partitioned data slices. - :rtype: Iterator[Dict[str, Any]] + Returns + ------- + Iterator[Dict[str, Any]] + Generator yielding dictionaries containing partitioned data slices. + """ pass diff --git a/src/temporalscope/partition/single_target/dynamic/__init__.py b/src/temporalscope/partition/single_target/dynamic/__init__.py index fae862f..635fe6e 100644 --- a/src/temporalscope/partition/single_target/dynamic/__init__.py +++ b/src/temporalscope/partition/single_target/dynamic/__init__.py @@ -41,21 +41,21 @@ dynamic algorithms are implemented in this module at this stage, the flexible architecture of TemporalScope allows users to integrate bespoke methods tailored to their specific domain requirements. -.. 
note:: - - Users are encouraged to leverage the TemporalPartitionerProtocol for building custom dynamic partitioning workflows - and refer to the foundational literature on dynamic partitioning techniques for guidance. - -.. seealso:: - - 1. Shah, A., DePavia, A., Hudson, N., Foster, I., & Stevens, R. (2024). - Causal Discovery over High-Dimensional Structured Hypothesis Spaces with Causal Graph Partitioning. - *arXiv preprint arXiv:2406.06348.* - 2. Nodoushan, A. N. (2023). Interpretability of Deep Learning Models for Time-Series Clinical Data. - (Doctoral dissertation, The University of Arizona). - 3. Saarela, M., & Podgorelec, V. (2024). Recent Applications of Explainable AI (XAI): A Systematic Literature Review. - *Applied Sciences, 14(19), 8884.* - 4. Nayebi, A., Tipirneni, S., Reddy, C. K., Foreman, B., & Subbian, V. (2023). - WindowSHAP: An efficient framework for explaining time-series classifiers based on Shapley values. - *Journal of Biomedical Informatics, 144, 104438.* +Notes +----- +Users are encouraged to leverage the TemporalPartitionerProtocol for building custom dynamic partitioning workflows +and refer to the foundational literature on dynamic partitioning techniques for guidance. + +See Also +-------- +1. Shah, A., DePavia, A., Hudson, N., Foster, I., & Stevens, R. (2024). + Causal Discovery over High-Dimensional Structured Hypothesis Spaces with Causal Graph Partitioning. + *arXiv preprint arXiv:2406.06348.* +2. Nodoushan, A. N. (2023). Interpretability of Deep Learning Models for Time-Series Clinical Data. + (Doctoral dissertation, The University of Arizona). +3. Saarela, M., & Podgorelec, V. (2024). Recent Applications of Explainable AI (XAI): A Systematic Literature Review. + *Applied Sciences, 14(19), 8884.* +4. Nayebi, A., Tipirneni, S., Reddy, C. K., Foreman, B., & Subbian, V. (2023). + WindowSHAP: An efficient framework for explaining time-series classifiers based on Shapley values. + *Journal of Biomedical Informatics, 144, 104438.* """ diff --git a/src/temporalscope/partition/single_target/padding/functional.py b/src/temporalscope/partition/single_target/padding/functional.py index 64e0a24..a8e4221 100644 --- a/src/temporalscope/partition/single_target/padding/functional.py +++ b/src/temporalscope/partition/single_target/padding/functional.py @@ -36,22 +36,20 @@ - Explicit checks ensure all columns are numeric and free of null or NaN values. -Examples: +Examples -------- -.. code-block:: python - - import pandas as pd - import numpy as np - from temporalscope.partition.padding.functional import zero_pad - - df = pd.DataFrame({"feature_1": [10, 20], "feature_2": [30, 40], "target": [50, 60]}) - padded_df = zero_pad(df, target_len=5, pad_value=0, padding="post") - print(padded_df) - -.. note:: - -Note: ----- +```python +import pandas as pd +import numpy as np +from temporalscope.partition.padding.functional import zero_pad + +df = pd.DataFrame({"feature_1": [10, 20], "feature_2": [30, 40], "target": [50, 60]}) +padded_df = zero_pad(df, target_len=5, pad_value=0, padding="post") +print(padded_df) +``` + +Notes +----- This module draws inspiration from industry-standard patterns, including: - TensorFlow's `TimeseriesGenerator` for its emphasis on preprocessing flexibility. - PyTorch's `Dataset` API for its focus on functional design and data transformations. @@ -59,30 +57,27 @@ Refer to the API documentation for further details on usage patterns and constraints. 
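+
+A corresponding sketch for mean-fill padding, using the `mean_fill_pad` function defined in
+this module (values are illustrative; the import path is inferred from the file location):
+
+```python
+import pandas as pd
+
+from temporalscope.partition.single_target.padding.functional import mean_fill_pad
+
+df = pd.DataFrame({"feature_1": [10.0, 20.0], "feature_2": [30.0, 40.0], "target": [50.0, 60.0]})
+padded_df = mean_fill_pad(df, target_len=5, padding="post")
+print(padded_df)  # three appended rows filled with each column's mean
+```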
+
 DataFrame Evaluation Modes:
-+--------+--------------------------------+--------------------------------+
-| Mode   | Key Characteristics            | Type Handling                  |
-+--------+--------------------------------+--------------------------------+
-| Eager  | - Immediate execution          | - Use schema for types         |
-|        | - Direct computation           | - Get Narwhals types direct    |
-|        | - Memory-bound ops             | - Narwhals ops supported       |
-+--------+--------------------------------+--------------------------------+
-| Lazy   | - Deferred execution           | - Must use native dtype        |
-|        | - Optimized planning           | - Schema not supported         |
-|        | - Large-scale data             | - Native type ops required     |
-+--------+--------------------------------+--------------------------------+
+---------------------------
+
+| Mode | Key Characteristics | Type Handling |
+|------|---------------------|---------------|
+| Eager | - Immediate execution<br>- Direct computation<br>- Memory-bound ops | - Use schema for types<br>- Get Narwhals types direct<br>- Narwhals ops supported |
- Optimized planning
- Large-scale data | - Must use native dtype
- Schema not supported
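As a concrete illustration of the eager/lazy rules above, the sketch below mirrors the row-count pattern used by `_get_row_count` and `_get_scalar_value` later in this diff: the same Narwhals expression works in both modes, and `collect()` is called only when the result is lazy. The helper name is illustrative, not part of the library.

```python
import narwhals as nw
import pandas as pd


@nw.narwhalify
def row_count(df) -> int:
    # Same Narwhals expression for eager and lazy backends.
    result = df.select([nw.col(df.columns[0]).count().cast(nw.Int64).alias("count")])
    # Lazy results (e.g. Dask) expose collect(); eager results can be indexed directly.
    if hasattr(result, "collect"):
        result = result.collect()
    return int(result["count"][0])


print(row_count(pd.DataFrame({"x": [1, 2, 3]})))  # 3
```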
 Critical Rules:
+---------------
 - Never mix eager/lazy operations
 - Use narwhals operations consistently, noting Dask requires special handling for concatenation
 - Convert to native format only when required
 - Maintain same mode in concatenations, using backend-specific methods when needed (e.g. dask.concat)

-.. seealso::
-    1. Dwarampudi, M. and Reddy, N.V., 2019. Effects of padding on LSTMs and CNNs. arXiv preprint arXiv:1903.07288.
-    2. Lafabregue, B., Weber, J., et al., 2022. End-to-end deep representation learning for time
-       series clustering: a comparative study. Data Mining and Knowledge Discovery.
-
+See Also
+--------
+1. Dwarampudi, M. and Reddy, N.V., 2019. Effects of padding on LSTMs and CNNs. arXiv preprint arXiv:1903.07288.
+2. Lafabregue, B., Weber, J., et al., 2022. End-to-end deep representation learning for time
+   series clustering: a comparative study. Data Mining and Knowledge Discovery.
 """

@@ -102,11 +97,31 @@ def mean_fill_pad(
     A simple padding function that extends a DataFrame to a target length by adding
     rows filled with each column's mean value. Handles both eager and lazy evaluation.

-    :param df: DataFrame to pad
-    :param target_len: Desired length after padding
-    :param padding: Where to add padding ('pre' or 'post')
-    :return: Padded DataFrame
-    :raises ValueError: If target_len <= current length or invalid padding direction
+    Parameters
+    ----------
+    df : SupportedTemporalDataFrame
+        DataFrame to pad
+    target_len : int
+        Desired length after padding
+    padding : str
+        Where to add padding ('pre' or 'post'). Default is "post".
+
+    Returns
+    -------
+    SupportedTemporalDataFrame
+        Padded DataFrame
+
+    Raises
+    ------
+    ValueError
+        If target_len <= current length or invalid padding direction
+
     """
     # Validate data quality first
     null_counts = check_dataframe_nulls_nans(df, df.columns)
diff --git a/src/temporalscope/partition/single_target/static/__init__.py b/src/temporalscope/partition/single_target/static/__init__.py
index 075c7dd..b363f88 100644
--- a/src/temporalscope/partition/single_target/static/__init__.py
+++ b/src/temporalscope/partition/single_target/static/__init__.py
@@ -41,21 +41,21 @@
 This flexibility ensures the framework remains adaptable to diverse requirements
 and emerging techniques in time-series analysis.

-.. note::
-
-   Users are encouraged to leverage the TemporalPartitionerProtocol for building custom static partitioning workflows
-   and refer to the foundational literature on partitioning techniques for guidance.
-
-.. seealso::
-
-   1. Shah, A., DePavia, A., Hudson, N., Foster, I., & Stevens, R. (2024).
-      Causal Discovery over High-Dimensional Structured Hypothesis Spaces with Causal Graph Partitioning.
-      *arXiv preprint arXiv:2406.06348.*
-   2. Nodoushan, A. N. (2023). Interpretability of Deep Learning Models for Time-Series Clinical Data.
-      (Doctoral dissertation, The University of Arizona).
-   3. Saarela, M., & Podgorelec, V. (2024). Recent Applications of Explainable AI (XAI): A Systematic Literature Review.
-      *Applied Sciences, 14(19), 8884.*
-   4. Nayebi, A., Tipirneni, S., Reddy, C. K., Foreman, B., & Subbian, V. (2023).
-      WindowSHAP: An efficient framework for explaining time-series classifiers based on Shapley values.
-      *Journal of Biomedical Informatics, 144, 104438.*
+Notes
+-----
+Users are encouraged to leverage the TemporalPartitionerProtocol for building custom static partitioning workflows
+and refer to the foundational literature on partitioning techniques for guidance.
+ +See Also +-------- +1. Shah, A., DePavia, A., Hudson, N., Foster, I., & Stevens, R. (2024). + Causal Discovery over High-Dimensional Structured Hypothesis Spaces with Causal Graph Partitioning. + *arXiv preprint arXiv:2406.06348.* +2. Nodoushan, A. N. (2023). Interpretability of Deep Learning Models for Time-Series Clinical Data. + (Doctoral dissertation, The University of Arizona). +3. Saarela, M., & Podgorelec, V. (2024). Recent Applications of Explainable AI (XAI): A Systematic Literature Review. + *Applied Sciences, 14(19), 8884.* +4. Nayebi, A., Tipirneni, S., Reddy, C. K., Foreman, B., & Subbian, V. (2023). + WindowSHAP: An efficient framework for explaining time-series classifiers based on Shapley values. + *Journal of Biomedical Informatics, 144, 104438.* """ diff --git a/src/temporalscope/partition/single_target/static/sliding_window.py b/src/temporalscope/partition/single_target/static/sliding_window.py index 34dcd68..d2631f9 100644 --- a/src/temporalscope/partition/single_target/static/sliding_window.py +++ b/src/temporalscope/partition/single_target/static/sliding_window.py @@ -36,34 +36,14 @@ Engineering Design: ------------------- -+-------------------------+-------------------------------------------------------+ -| Aspect | Description | -+-------------------------+-------------------------------------------------------+ -| Partial Temporal | Follows a universal model design; allows overlapping | -| Ordering | labels within partitions while leaving strict | -| | temporal ordering to the user. | -+-------------------------+-------------------------------------------------------+ -| Narwhals API | Leverages Narwhals backend for efficient operations. | -| | Users can switch between supported backends (e.g., | -| | Pandas, Polars, Modin) using core_utils. | -+-------------------------+-------------------------------------------------------+ -| Dataset Accessibility | Inspired by Dask and TensorFlow design. Provides | -| | hierarchical access via `partitions[index]["train"]`, | -| | `test`, `validation`, or `full` labels. | -+-------------------------+-------------------------------------------------------+ -| Lazy/Eager Execution | Narwhals backend supports lazy/eager evaluation; | -| | generator pattern ensures memory-efficient `fit` and | -| | `transform` operations. | -+-------------------------+-------------------------------------------------------+ -| Human-Centric Design | Combines human-readable labels (`train`, `test`) with | -| | indexing for scalable workflows, reducing cognitive | -| | overhead when handling large numbers of partitions. | -+-------------------------+-------------------------------------------------------+ -| Padding Control | Leaves padding decisions (e.g., zero-padding) to | -| | users while allowing configurable truncation. | -+-------------------------+-------------------------------------------------------+ - -.. note:: +| Aspect | Description | +|--------|-------------| +| Partial Temporal Ordering | Follows a universal model design; allows overlapping labels within partitions while leaving strict temporal ordering to the user. | +| Narwhals API | Leverages Narwhals backend for efficient operations. Users can switch between supported backends (e.g., Pandas, Polars, Modin) using core_utils. | +| Dataset Accessibility | Inspired by Dask and TensorFlow design. Provides hierarchical access via `partitions[index]["train"]`, `test`, `validation`, or `full` labels. 
| +| Lazy/Eager Execution | Narwhals backend supports lazy/eager evaluation; generator pattern ensures memory-efficient `fit` and `transform` operations. | +| Human-Centric Design | Combines human-readable labels (`train`, `test`) with indexing for scalable workflows, reducing cognitive overhead when handling large numbers of partitions. | +| Padding Control | Leaves padding decisions (e.g., zero-padding) to users while allowing configurable truncation. | Visualization: -------------- @@ -71,7 +51,8 @@ partitions. Each "X" represents a row included in the respective partition, based on the configured window size and stride. -| Time | Partition 1 | Partition 2 | Partition 3 | Partition 4 | Partition 5 | + +| Time | Partition 1 | Partition 2 | Partition 3 | Partition 4 | Partition 5 | |--------------|-------------|-------------|-------------|-------------|-------------| | 2021-01-01 | X | | | | | | 2021-01-02 | X | X | | | | @@ -80,26 +61,28 @@ | 2021-01-05 | | | | X | X | | 2021-01-06 | | | | | X | -.. seealso:: - - 1. Gu et al., 2021. The sliding window and SHAP theory applied to long - short-term memory networks for state of charge prediction. - 2. Pham et al., 2023. Speech emotion recognition using overlapping sliding - window and explainable neural networks. - 3. Van Zyl et al., 2024. Explainable AI for feature selection in time series - energy forecasting with Grad-CAM and SHAP. - 4. Bi et al., 2020. Prediction model for identifying methylation sites with - XGBoost and SHAP explainability. - 5. Zimmermann et al., 2022. Improving drift detection by monitoring SHAP - loss values in pattern recognition workflows. - 6. Li et al., 2022. Visualizing distributional shifts using SHAP in machine - learning models. - 7. Seiffer et al., 2021. Concept drift detection in manufacturing data with - SHAP for error prediction improvement. - 8. Haug et al., 2022. Change detection for local explainability in evolving - data streams. - 9. Zhao et al., 2020. Feature drift detection in evolving data streams with - database applications. + +See Also +-------- +1. Gu et al., 2021. The sliding window and SHAP theory applied to long + short-term memory networks for state of charge prediction. +2. Pham et al., 2023. Speech emotion recognition using overlapping sliding + window and explainable neural networks. +3. Van Zyl et al., 2024. Explainable AI for feature selection in time series + energy forecasting with Grad-CAM and SHAP. +4. Bi et al., 2020. Prediction model for identifying methylation sites with + XGBoost and SHAP explainability. +5. Zimmermann et al., 2022. Improving drift detection by monitoring SHAP + loss values in pattern recognition workflows. +6. Li et al., 2022. Visualizing distributional shifts using SHAP in machine + learning models. +7. Seiffer et al., 2021. Concept drift detection in manufacturing data with + SHAP for error prediction improvement. +8. Haug et al., 2022. Change detection for local explainability in evolving + data streams. +9. Zhao et al., 2020. Feature drift detection in evolving data streams with + database applications. 
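The window arithmetic behind the visualization above (six rows, window size 2, stride 1) can be sketched without any TemporalScope APIs; this only shows how the partition bounds in that table are derived:

```python
# Reproduce the layout shown above: 6 timestamps, window_size=2, stride=1.
timestamps = [f"2021-01-0{day}" for day in range(1, 7)]
window_size, stride = 2, 1

bounds = [
    (start, start + window_size)
    for start in range(0, len(timestamps) - window_size + 1, stride)
]

for i, (start, end) in enumerate(bounds, start=1):
    print(f"Partition {i}: {timestamps[start:end]}")
# Partition 1: ['2021-01-01', '2021-01-02']
# Partition 2: ['2021-01-02', '2021-01-03']
# ... Partition 5: ['2021-01-05', '2021-01-06']
```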
+ """ # from typing import Iterator, Optional diff --git a/src/temporalscope/partition/single_target/utils.py b/src/temporalscope/partition/single_target/utils.py index 7877897..943c035 100644 --- a/src/temporalscope/partition/single_target/utils.py +++ b/src/temporalscope/partition/single_target/utils.py @@ -34,17 +34,35 @@ def validate_percentages( This function ensures percentages are within the range [0, 1], computes missing values, and validates that their sum equals 1.0. - :param train_pct: Percentage of data allocated for training. - :type train_pct: float - :param test_pct: Percentage of data allocated for testing. - :type test_pct: Optional[float] - :param val_pct: Percentage of data allocated for validation. - :type val_pct: Optional[float] - :param precision: Tolerance for floating-point imprecision. Default is 1e-6. - :type precision: float - :return: Tuple of validated percentages (train_pct, test_pct, val_pct). - :rtype: Tuple[float, float, float] - :raises ValueError: If percentages are invalid or do not sum to 1.0. + Parameters + ---------- + train_pct : float + Percentage of data allocated for training. + test_pct : Optional[float] + Percentage of data allocated for testing. + val_pct : Optional[float] + Percentage of data allocated for validation. + precision : float + Tolerance for floating-point imprecision. Default is 1e-6. + train_pct: float : + + test_pct: Optional[float] : + + val_pct: Optional[float] : + + precision: float : + (Default value = 1e-6) + + Returns + ------- + Tuple[float, float, float] + Tuple of validated percentages (train_pct, test_pct, val_pct). + + Raises + ------ + ValueError + If percentages are invalid or do not sum to 1.0. + """ if not (0 <= train_pct <= 1): raise ValueError("`train_pct` must be between 0 and 1.") @@ -86,18 +104,36 @@ def determine_partition_scheme( This function calculates `num_partitions` or `window_size` based on the dataset size. - :param num_partitions: Number of partitions, optional. - :type num_partitions: Optional[int] - :param window_size: Size of each partition, optional. - :type window_size: Optional[int] - :param total_rows: Total number of rows in the dataset. - :type total_rows: int - :param stride: Number of rows to skip between partitions. Defaults to `window_size`. - :type stride: Optional[int] - :return: Tuple containing the partition scheme ("num_partitions" or "window_size"), - the determined number of partitions, and window size. - :rtype: Tuple[str, int, int] - :raises ValueError: If both `num_partitions` and `window_size` are invalid. + Parameters + ---------- + num_partitions : Optional[int] + Number of partitions, optional. + window_size : Optional[int] + Size of each partition, optional. + total_rows : int + Total number of rows in the dataset. + stride : Optional[int] + Number of rows to skip between partitions. Defaults to `window_size`. + num_partitions: Optional[int] : + + window_size: Optional[int] : + + total_rows: int : + + stride: Optional[int] : + + + Returns + ------- + Tuple[str, int, int] + Tuple containing the partition scheme ("num_partitions" or "window_size"), + the determined number of partitions, and window size. + + Raises + ------ + ValueError + If both `num_partitions` and `window_size` are invalid. 
+ """ if num_partitions is None and window_size is None: raise ValueError("Either `num_partitions` or `window_size` must be specified.") @@ -122,13 +158,30 @@ def determine_partition_scheme( def validate_cardinality(num_partitions: int, window_size: int, total_rows: int) -> None: """Validate dataset cardinality for the partitioning configuration. - :param num_partitions: Number of partitions. - :type num_partitions: int - :param window_size: Size of each partition. - :type window_size: int - :param total_rows: Total number of rows in the dataset. - :type total_rows: int - :raises ValueError: If dataset cardinality is insufficient for the configuration. + Parameters + ---------- + num_partitions : int + Number of partitions. + window_size : int + Size of each partition. + total_rows : int + Total number of rows in the dataset. + num_partitions: int : + + window_size: int : + + total_rows: int : + + + Returns + ------- + None + + Raises + ------ + ValueError + If dataset cardinality is insufficient for the configuration. + """ if num_partitions > total_rows: raise ValueError(f"Insufficient rows ({total_rows}) for `num_partitions={num_partitions}`.") @@ -143,9 +196,22 @@ def print_config(config: dict) -> None: (`int`, `float`, `bool`, `str`). It raises an error for any invalid types and then prints the configuration as a table. - :param config: Configuration dictionary with parameter names as keys and their values. - :type config: dict - :raises TypeError: If any value in the config dictionary is not an allowed type. + Parameters + ---------- + config : dict + Configuration dictionary with parameter names as keys and their values. + config: dict : + + + Returns + ------- + None + + Raises + ------ + TypeError + If any value in the config dictionary is not an allowed type. + """ # Allowed data types for config values allowed_types = (int, float, bool, str) diff --git a/src/temporalscope/target_shifters/single_step.py b/src/temporalscope/target_shifters/single_step.py index e3e4e8a..fbf5048 100644 --- a/src/temporalscope/target_shifters/single_step.py +++ b/src/temporalscope/target_shifters/single_step.py @@ -22,72 +22,56 @@ different DataFrame backends through Narwhals. Following the same backend-agnostic design as core_utils.py and temporal_data_loader.py, it ensures consistent behavior across all supported DataFrame types. -Engineering Design ------------------- +Engineering Design: +------------------- The SingleStepTargetShifter follows a clear separation between validation and transformation phases, designed to work seamlessly with both TimeFrame and raw DataFrame inputs. 
-+----------------+-------------------------------------------------------------------+
-| Component      | Description                                                       |
-+----------------+-------------------------------------------------------------------+
-| fit()          | Input validation phase that ensures:                              |
-|                | - Valid TimeFrame or supported DataFrame type                      |
-|                | - Target column is set or can be inferred                          |
-|                | - No Narwhals operations at this stage                             |
-+----------------+-------------------------------------------------------------------+
-| transform()    | Pure Narwhals transformation phase that:                           |
-|                | - Uses backend-agnostic operations only                            |
-|                | - Shifts target using Narwhals operations                          |
-|                | - Preserves TimeFrame metadata if present                          |
-+----------------+-------------------------------------------------------------------+
-
-Backend-Specific Patterns
--------------------------
+
+| Component | Description |
+|-----------|-------------|
+| `fit()` | Input validation phase that ensures:<br>- Valid TimeFrame or supported DataFrame type<br>- Target column is set or can be inferred<br>- No Narwhals operations at this stage |
+| `transform()` | Pure Narwhals transformation phase that:<br>- Uses backend-agnostic operations only<br>- Shifts target using Narwhals operations<br>- Preserves TimeFrame metadata if present |
+
+Backend-Specific Patterns:
+--------------------------
 The following table outlines key patterns for working with different DataFrame backends through Narwhals operations:

-+----------------+-------------------------------------------------------------------+
-| Backend        | Implementation Pattern                                            |
-+----------------+-------------------------------------------------------------------+
-| LazyFrame      | Represents lazy evaluation in Dask and Polars. Use collect() for  |
-| (Dask/Polars)  | scalar access, avoid direct indexing, and handle lazy evaluation  |
-|                | through proper Narwhals operations.                               |
-+----------------+-------------------------------------------------------------------+
-| PyArrow        | Handles scalar operations differently. Use nw.Int64 for           |
-|                | numeric operations, handle comparisons through Narwhals, and      |
-|                | convert types before arithmetic operations.                       |
-+----------------+-------------------------------------------------------------------+
-| All Backends   | Let @nw.narwhalify handle conversions between backends. Use pure  |
-|                | Narwhals operations and avoid any backend-specific code to ensure |
-|                | consistent behavior across all supported types.                   |
-+----------------+-------------------------------------------------------------------+
-
-Example Usage
--------------
-.. code-block:: python
-    import pandas as pd
-    from temporalscope.target_shifters.single_step import SingleStepTargetShifter
-    from temporalscope.core.temporal_data_loader import TimeFrame
+| Backend | Implementation Pattern |
+|---------|------------------------|
+| LazyFrame (Dask/Polars) | Represents lazy evaluation in Dask and Polars. Use `collect()` for scalar access, avoid direct indexing, and handle lazy evaluation through proper Narwhals operations. |
+| PyArrow | Handles scalar operations differently. Use `nw.Int64` for numeric operations, handle comparisons through Narwhals, and convert types before arithmetic operations. |
+| All Backends | Let `@nw.narwhalify` handle conversions between backends. Use pure Narwhals operations and avoid any backend-specific code to ensure consistent behavior across all supported types. |
-    # With TimeFrame
-    df = pd.DataFrame({"time": range(10), "target": range(10), "feature": range(10)})
-    tf = TimeFrame(df=df, time_col="time", target_col="target")
-    shifter = SingleStepTargetShifter(n_lags=1)
-    transformed_tf = shifter.fit_transform(tf)
-
-    # With DataFrame
-    df = pd.DataFrame({"target": range(10), "feature": range(10)})
-    shifter = SingleStepTargetShifter(target_col="target", n_lags=1)
-    transformed_df = shifter.fit_transform(df)
+Examples
+--------
+```python
+import pandas as pd
+from temporalscope.target_shifters.single_step import SingleStepTargetShifter
+from temporalscope.core.temporal_data_loader import TimeFrame
-.. note::
-    - Uses a familiar fit/transform pattern for consistency, while implementing
-      all operations through Narwhals' backend-agnostic API
-    - Currently implements single-step prediction only.
For multi-step sequence prediction, - see the planned MultiStepTargetShifter in temporalscope.target_shifters.multi_step - - When validating DataFrames, must get native format first since Narwhals wraps - but does not implement actual DataFrame types +# With TimeFrame +df = pd.DataFrame({"time": range(10), "target": range(10), "feature": range(10)}) +tf = TimeFrame(df=df, time_col="time", target_col="target") +shifter = SingleStepTargetShifter(n_lags=1) +transformed_tf = shifter.fit_transform(tf) + +# With DataFrame +df = pd.DataFrame({"target": range(10), "feature": range(10)}) +shifter = SingleStepTargetShifter(target_col="target", n_lags=1) +transformed_df = shifter.fit_transform(df) +``` + +Notes +----- +- Uses a familiar fit/transform pattern for consistency, while implementing + all operations through Narwhals' backend-agnostic API +- Currently implements single-step prediction only. For multi-step sequence prediction, + see the planned MultiStepTargetShifter in temporalscope.target_shifters.multi_step +- When validating DataFrames, must get native format first since Narwhals wraps + but does not implement actual DataFrame types """ from typing import Optional, Union @@ -111,8 +95,8 @@ class SingleStepTargetShifter: working with both TimeFrame objects and raw DataFrames through Narwhals' backend-agnostic operations. - Engineering Design Assumptions - ---------------------------- + Engineering Design Assumptions: + ------------------------------- 1. Separation of Concerns: - fit: Validates inputs and sets parameters - transform: Pure Narwhals operations for shifting @@ -134,53 +118,60 @@ class SingleStepTargetShifter: - DataFrame: Validates in fit - numpy array: Converts in fit - :param target_col: Column name to shift (optional, can be inferred from TimeFrame) - :type target_col: str, optional - :param n_lags: Number of steps to shift target, must be > 0 - :type n_lags: int - :param drop_target: Whether to remove original target column - :type drop_target: bool - :param verbose: Enable progress/debug logging - :type verbose: bool - :param mode: Operation mode, defaults to single-step - :type mode: str - :raises ValueError: If n_lags ≤ 0 - - Example with TimeFrame: - -------------------- - .. 
code-block:: python - - import pandas as pd - from temporalscope.target_shifters.single_step import SingleStepTargetShifter - from temporalscope.core.temporal_data_loader import TimeFrame + Attributes + ---------- + target_col : str, optional + Column name to shift (optional, can be inferred from TimeFrame) + n_lags : int + Number of steps to shift target, must be > 0 + drop_target : bool + Whether to remove original target column + verbose : bool + Enable progress/debug logging + mode : str + Operation mode, defaults to single-step + + + Raises + ------ + ValueError + If n_lags ≤ 0 + + Examples + -------- + ```python + import pandas as pd + from temporalscope.target_shifters.single_step import SingleStepTargetShifter + from temporalscope.core.temporal_data_loader import TimeFrame - # Create TimeFrame - df = pd.DataFrame({"time": range(5), "target": range(5), "feature": range(5)}) - tf = TimeFrame(df=df, time_col="time", target_col="target") + # Create TimeFrame + df = pd.DataFrame({"time": range(5), "target": range(5), "feature": range(5)}) + tf = TimeFrame(df=df, time_col="time", target_col="target") - # Initialize and transform - shifter = SingleStepTargetShifter(n_lags=1) - transformed_tf = shifter.fit_transform(tf) + # Initialize and transform + shifter = SingleStepTargetShifter(n_lags=1) + transformed_tf = shifter.fit_transform(tf) + ``` - Example with DataFrame: - ------------------- - .. code-block:: python + ```python + import pandas as pd + from temporalscope.target_shifters.single_step import SingleStepTargetShifter - import pandas as pd - from temporalscope.target_shifters.single_step import SingleStepTargetShifter + # Create DataFrame + df = pd.DataFrame({"target": range(5), "feature": range(5)}) - # Create DataFrame - df = pd.DataFrame({"target": range(5), "feature": range(5)}) + # Initialize and transform + shifter = SingleStepTargetShifter(target_col="target") + transformed_df = shifter.fit_transform(df) + ``` - # Initialize and transform - shifter = SingleStepTargetShifter(target_col="target") - transformed_df = shifter.fit_transform(df) + Notes + ----- + Backend-Specific Patterns: + - Use collect() for scalar access (LazyFrame) + - Use nw.Int64 for scalar operations (PyArrow) + - Let @nw.narwhalify handle conversions - .. note:: - Backend-Specific Patterns: - - Use collect() for scalar access (LazyFrame) - - Use nw.Int64 for scalar operations (PyArrow) - - Let @nw.narwhalify handle conversions """ def __init__( @@ -207,18 +198,27 @@ def __init__( def _get_scalar_value(self, result, column: str) -> int: """Helper method to get scalar value from different DataFrame backends. - :param result: DataFrame result containing scalar value - :param column: Column name containing the scalar value - :return: Python integer value - :rtype: int - - .. 
note:: - Handles different DataFrame backend scalar access: - - Uses collect() for LazyFrame values - - Converts PyArrow scalars to Python int - - Returns native Python int for all cases - - No @nw.narwhalify needed as it handles scalar values after DataFrame operations, - not DataFrame operations themselves + Parameters + ---------- + result : + DataFrame result containing scalar value + column : + Column name containing the scalar value + column: str : + + + Returns + ------- + int + + Notes + ----- + Handles different DataFrame backend scalar access: + - Uses collect() for LazyFrame values + - Converts PyArrow scalars to Python int + - Returns native Python int for all cases + - No @nw.narwhalify needed as it handles scalar values after DataFrame operations, not DataFrame operations themselves Python integer value + """ if hasattr(result, "collect"): # For LazyFrame value = result.collect()[column][0] @@ -235,19 +235,29 @@ def _get_scalar_value(self, result, column: str) -> int: def _get_row_count(self, df: SupportedTemporalDataFrame, check_empty: bool = True) -> int: """Get row count using Narwhals operations. - :param df: DataFrame to count rows for - :type df: SupportedTemporalDataFrame - :param check_empty: Whether to raise "Cannot transform empty DataFrame" error - :type check_empty: bool - :return: Number of rows in DataFrame - :rtype: int - - .. note:: - Uses Narwhals operations for backend-agnostic row counting: - - nw.Int64 for scalar type conversion - - collect() for scalar access - - Handles LazyFrame and PyArrow scalars - - Controls empty DataFrame error handling based on check_empty parameter + Parameters + ---------- + df : SupportedTemporalDataFrame + DataFrame to count rows for + check_empty : bool + Whether to raise "Cannot transform empty DataFrame" error + df: SupportedTemporalDataFrame : + + check_empty: bool : + (Default value = True) + + Returns + ------- + int + + Notes + ----- + Uses Narwhals operations for backend-agnostic row counting: + - nw.Int64 for scalar type conversion + - collect() for scalar access + - Handles LazyFrame and PyArrow scalars + - Controls empty DataFrame error handling based on check_empty parameter Number of rows in DataFrame + """ try: result = df.select([nw.col(df.columns[0]).count().cast(nw.Int64).alias("count")]) @@ -263,16 +273,24 @@ def _get_row_count(self, df: SupportedTemporalDataFrame, check_empty: bool = Tru def _shift_target(self, df: SupportedTemporalDataFrame) -> SupportedTemporalDataFrame: """Shift target column using Narwhals operations. - :param df: DataFrame to transform - :type df: SupportedTemporalDataFrame - :return: DataFrame with shifted target - :rtype: SupportedTemporalDataFrame + Parameters + ---------- + df : SupportedTemporalDataFrame + DataFrame to transform + df: SupportedTemporalDataFrame : + + + Returns + ------- + SupportedTemporalDataFrame + + Notes + ----- + Uses Narwhals operations for backend-agnostic shifting: + - with_columns() for adding shifted column + - filter() for removing null values + - drop() for removing original target DataFrame with shifted target - .. 
note:: - Uses Narwhals operations for backend-agnostic shifting: - - with_columns() for adding shifted column - - filter() for removing null values - - drop() for removing original target """ if self.target_col not in df.columns: raise ValueError("target_col must be set before transform (call fit first)") @@ -299,49 +317,61 @@ def fit(self, X: Union[TimeFrame, SupportedTemporalDataFrame, np.ndarray], y=Non - DataFrame: Validates using is_valid_temporal_dataframe - numpy array: Converts to DataFrame first - :param X: Input data to validate - :type X: Union[TimeFrame, SupportedTemporalDataFrame, np.ndarray] - :param y: Ignored, exists for scikit-learn compatibility - :return: self - :rtype: SingleStepTargetShifter - :raises ValueError: If target_col not set and cannot be inferred - :raises TypeError: If input type is not supported - - Example with TimeFrame: - -------------------- - .. code-block:: python - - import pandas as pd - from temporalscope.target_shifters.single_step import SingleStepTargetShifter - from temporalscope.core.temporal_data_loader import TimeFrame - - # Create TimeFrame - df = pd.DataFrame({"time": range(5), "target": range(5), "feature": range(5)}) - tf = TimeFrame(df=df, time_col="time", target_col="target") - - # Initialize and fit - shifter = SingleStepTargetShifter(n_lags=1) - shifter.fit(tf) - - Example with DataFrame: - ------------------- - .. code-block:: python - - import pandas as pd - from temporalscope.target_shifters.single_step import SingleStepTargetShifter - - # Create DataFrame - df = pd.DataFrame({"target": range(5), "feature": range(5)}) - - # Initialize and fit - shifter = SingleStepTargetShifter(target_col="target") - shifter.fit(df) - - .. note:: - Input Validation: - - No Narwhals operations in fit() - - Validates before any transformations - - Handles all input types consistently + Parameters + ---------- + X : Union[TimeFrame, SupportedTemporalDataFrame, np.ndarray] + Input data to validate + y : + Ignored, exists for scikit-learn compatibility (Default value = None) + X: Union[TimeFrame, SupportedTemporalDataFrame, np.ndarray] : + + Returns + ------- + SingleStepTargetShifter + self + + Raises + ------ + ValueError + If target_col not set and cannot be inferred + TypeError + If input type is not supported + + Examples + -------- + ```python + import pandas as pd + from temporalscope.target_shifters.single_step import SingleStepTargetShifter + from temporalscope.core.temporal_data_loader import TimeFrame + + # Create TimeFrame + df = pd.DataFrame({"time": range(5), "target": range(5), "feature": range(5)}) + tf = TimeFrame(df=df, time_col="time", target_col="target") + + # Initialize and fit + shifter = SingleStepTargetShifter(n_lags=1) + shifter.fit(tf) + ``` + + ```python + import pandas as pd + from temporalscope.target_shifters.single_step import SingleStepTargetShifter + + # Create DataFrame + df = pd.DataFrame({"target": range(5), "feature": range(5)}) + + # Initialize and fit + shifter = SingleStepTargetShifter(target_col="target") + shifter.fit(df) + ``` + + Notes + ----- + Input Validation: + - No Narwhals operations in fit() + - Validates before any transformations + - Handles all input types consistently + """ if isinstance(X, TimeFrame): self.target_col = X._target_col @@ -373,50 +403,56 @@ def transform( This method assumes inputs are already validated by fit() and uses pure Narwhals operations for all transformations. 
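Conceptually, the transformation is equivalent to a negative shift followed by dropping the unlabeled final row; the plain-pandas sketch below illustrates `n_lags=1` with `drop_target=True` (this is not the library's internal implementation, and the shifted column name is only illustrative):

```python
import pandas as pd

df = pd.DataFrame({"time": range(5), "target": range(5), "feature": range(5)})

# Next-step target becomes the label; the last row has no label and is dropped.
shifted = (
    df.assign(target_shift_1=df["target"].shift(-1))
    .dropna(subset=["target_shift_1"])
    .drop(columns=["target"])
)
print(shifted)
```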
- :param X: Input data to transform - :type X: Union[TimeFrame, SupportedTemporalDataFrame, np.ndarray] - :param y: Ignored, exists for scikit-learn compatibility - :return: Transformed data - :rtype: Union[TimeFrame, SupportedTemporalDataFrame, np.ndarray] - - Example with TimeFrame: - -------------------- - .. code-block:: python - - import pandas as pd - from temporalscope.target_shifters.single_step import SingleStepTargetShifter - from temporalscope.core.temporal_data_loader import TimeFrame - - # Create TimeFrame - df = pd.DataFrame({"time": range(5), "target": range(5), "feature": range(5)}) - tf = TimeFrame(df=df, time_col="time", target_col="target") - - # Initialize and transform - shifter = SingleStepTargetShifter(n_lags=1) - shifter.fit(tf) - transformed_tf = shifter.transform(tf) - - Example with DataFrame: - ------------------- - .. code-block:: python - - import pandas as pd - from temporalscope.target_shifters.single_step import SingleStepTargetShifter - - # Create DataFrame - df = pd.DataFrame({"target": range(5), "feature": range(5)}) - - # Initialize and transform - shifter = SingleStepTargetShifter(target_col="target") - shifter.fit(df) - transformed_df = shifter.transform(df) - - .. note:: - Pure Narwhals implementation: - - _get_row_count() for counting - - _shift_target() for shifting - - Backend-agnostic operations - - Handles LazyFrame and PyArrow scalars + Parameters + ---------- + X : Union[TimeFrame, SupportedTemporalDataFrame, np.ndarray] + Input data to transform + y : + Ignored, exists for scikit-learn compatibility (Default value = None) + X: Union[TimeFrame, SupportedTemporalDataFrame, np.ndarray] : + + Returns + ------- + Union[TimeFrame, SupportedTemporalDataFrame, np.ndarray] + + Examples + -------- + ```python + import pandas as pd + from temporalscope.target_shifters.single_step import SingleStepTargetShifter + from temporalscope.core.temporal_data_loader import TimeFrame + + # Create TimeFrame + df = pd.DataFrame({"time": range(5), "target": range(5), "feature": range(5)}) + tf = TimeFrame(df=df, time_col="time", target_col="target") + + # Initialize and transform + shifter = SingleStepTargetShifter(n_lags=1) + shifter.fit(tf) + transformed_tf = shifter.transform(tf) + ``` + + ```python + import pandas as pd + from temporalscope.target_shifters.single_step import SingleStepTargetShifter + + # Create DataFrame + df = pd.DataFrame({"target": range(5), "feature": range(5)}) + + # Initialize and transform + shifter = SingleStepTargetShifter(target_col="target") + shifter.fit(df) + transformed_df = shifter.transform(df) + ``` + + Notes + ----- + Pure Narwhals implementation: + - _get_row_count() for counting + - _shift_target() for shifting + - Backend-agnostic operations + - Handles LazyFrame and PyArrow scalars transformed data + """ was_numpy = isinstance(X, np.ndarray) if was_numpy: @@ -473,46 +509,53 @@ def fit_transform( This method combines input validation (fit) with Narwhals transformations (transform) in a single operation. - :param X: Input data to transform - :type X: Union[TimeFrame, SupportedTemporalDataFrame, np.ndarray] - :param y: Ignored, exists for scikit-learn compatibility - :return: Transformed data - :rtype: Union[TimeFrame, SupportedTemporalDataFrame, np.ndarray] + Parameters + ---------- + X : Union[TimeFrame, SupportedTemporalDataFrame, np.ndarray] + Input data to transform + y : + Ignored, exists for scikit-learn compatibility (Default value = None) - Example with TimeFrame: - -------------------- - .. 
code-block:: python + Returns + ------- + Union[TimeFrame, SupportedTemporalDataFrame, np.ndarray] - import pandas as pd - from temporalscope.target_shifters.single_step import SingleStepTargetShifter - from temporalscope.core.temporal_data_loader import TimeFrame + Examples + -------- + ```python + import pandas as pd + from temporalscope.target_shifters.single_step import SingleStepTargetShifter + from temporalscope.core.temporal_data_loader import TimeFrame - # Create TimeFrame - df = pd.DataFrame({"time": range(5), "target": range(5), "feature": range(5)}) - tf = TimeFrame(df=df, time_col="time", target_col="target") + # Create TimeFrame + df = pd.DataFrame({"time": range(5), "target": range(5), "feature": range(5)}) + tf = TimeFrame(df=df, time_col="time", target_col="target") - # Initialize and transform - shifter = SingleStepTargetShifter(n_lags=1) - transformed_tf = shifter.fit_transform(tf) + # Initialize and transform + shifter = SingleStepTargetShifter(n_lags=1) + transformed_tf = shifter.fit_transform(tf) + ``` - Example with DataFrame: - ------------------- - .. code-block:: python + Examples + -------- + ```python + import pandas as pd + from temporalscope.target_shifters.single_step import SingleStepTargetShifter - import pandas as pd - from temporalscope.target_shifters.single_step import SingleStepTargetShifter + # Create DataFrame + df = pd.DataFrame({"target": range(5), "feature": range(5)}) - # Create DataFrame - df = pd.DataFrame({"target": range(5), "feature": range(5)}) + # Initialize and transform + shifter = SingleStepTargetShifter(target_col="target", n_lags=1) + transformed_df = shifter.fit_transform(df) + ``` - # Initialize and transform - shifter = SingleStepTargetShifter(target_col="target", n_lags=1) - transformed_df = shifter.fit_transform(df) + Notes + ----- + Operation Flow: + 1. fit(): Validates inputs + 2. transform(): Pure Narwhals operations + 3. Handles all backend types consistently transformed data - .. note:: - Operation Flow: - 1. fit(): Validates inputs - 2. transform(): Pure Narwhals operations - 3. Handles all backend types consistently """ return self.fit(X).transform(X) diff --git a/test/unit/datasets/test_synthetic_data_generator.py b/test/unit/datasets/test_synthetic_data_generator.py index b3aade9..cf320dd 100644 --- a/test/unit/datasets/test_synthetic_data_generator.py +++ b/test/unit/datasets/test_synthetic_data_generator.py @@ -172,14 +172,13 @@ def test_time_column_generation(backend: str, time_col_numeric: bool) -> None: assert pa.types.is_floating(time_type), "Expected PyArrow float column" else: assert isinstance(time_val, (np.float64, float)), "Expected numeric column" + elif isinstance(df, pl.DataFrame): + assert isinstance(time_val, datetime), "Expected Polars datetime column" + elif isinstance(df, pa.Table): + time_type = df.schema.field("time").type # type: ignore + assert isinstance(time_type, pa.TimestampType), "Expected PyArrow timestamp column" else: - if isinstance(df, pl.DataFrame): - assert isinstance(time_val, datetime), "Expected Polars datetime column" - elif isinstance(df, pa.Table): - time_type = df.schema.field("time").type # type: ignore - assert isinstance(time_type, pa.TimestampType), "Expected PyArrow timestamp column" - else: - assert isinstance(time_val, (pd.Timestamp, datetime)), "Expected timestamp column" + assert isinstance(time_val, (pd.Timestamp, datetime)), "Expected timestamp column" # ========================= Error Handling Tests =========================