Merge branch 'mlflow_docs' of github.com:Nixtla/statsforecast into mlflow_docs
kvnkho committed Oct 21, 2023
2 parents 3e05112 + 9357acf commit 5b4969a
Showing 88 changed files with 608,439 additions and 55,011 deletions.
52 changes: 52 additions & 0 deletions .github/workflows/build-docs.yaml
@@ -0,0 +1,52 @@
name: "build-docs"
on:
push:
branches: ["main"]
pull_request:
branches: ["main"]
workflow_dispatch:

defaults:
run:
shell: bash

jobs:
build-docs:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
with:
submodules: 'recursive'
- uses: actions/setup-python@v4
with:
cache: "pip"
python-version: '3.10'
cache-dependency-path: settings.ini
- name: Build docs
run: |
set -ux
python -m pip install --upgrade pip
pip install -Uq nbdev
pip install -e ".[dev]"
mkdir nbs/_extensions
cp -r docs-scripts/mintlify/ nbs/_extensions/
python docs-scripts/update-quarto.py
echo "procs = nbdev_plotly.plotly:PlotlyProc" >> settings.ini
nbdev_docs
- name: Apply final formats
run: bash ./docs-scripts/docs-final-formatting.bash
- name: Copy over necessary assets
run: |
cp nbs/mint.json _docs/mint.json
cp docs-scripts/imgs/* _docs/
- name: Deploy to Mintlify Docs
if: github.event_name == 'push'
uses: peaceiris/actions-gh-pages@v3
with:
github_token: ${{ secrets.GITHUB_TOKEN }}
publish_branch: docs
publish_dir: ./_docs
# The following lines assign commit authorship to the official GH-Actions bot for deploys to `docs` branch.
# You can swap them out with your own user credentials.
user_name: github-actions[bot]
user_email: 41898282+github-actions[bot]@users.noreply.github.com
6 changes: 5 additions & 1 deletion .github/workflows/ci.yaml
@@ -14,6 +14,9 @@ concurrency:
   group: ${{ github.workflow }}-${{ github.ref }}
   cancel-in-progress: true

+env:
+  NIXTLA_NUMBA_CACHE: '1'
+
 jobs:
   nb-sync:
     runs-on: ubuntu-latest
@@ -62,6 +65,7 @@ jobs:

   run-tests:
     runs-on: ${{ matrix.os }}
+    timeout-minutes: 60
     strategy:
       fail-fast: false
       matrix:
@@ -87,7 +91,7 @@ jobs:
       - name: Run integration tests
         run: |
           pip install ".[dev]" pytest
-          pytest action_files
+          pytest --durations=0 action_files

   test-m3-performance:
     runs-on: ubuntu-latest
5 changes: 3 additions & 2 deletions .gitignore
@@ -9,17 +9,18 @@ dist
.vscode
.idea
*.gif
*.icloud
*.csv
*/data/*
*.parquet
tmp
_docs/
_proc/
sidebar.yml
.DS_Store
.gitattributes
.gitconfig
nbs/.last_checked
.venv
.idea
mlruns/
mlruns/
.luarc.json
4 changes: 4 additions & 0 deletions .gitmodules
@@ -0,0 +1,4 @@
[submodule "docs-scripts"]
path = docs-scripts
url = git@github.com:Nixtla/docs.git
branch = scripts
2 changes: 1 addition & 1 deletion README.md
@@ -259,4 +259,4 @@ Thanks goes to these wonderful people ([emoji key](https://allcontributors.org/d

 <!-- ALL-CONTRIBUTORS-LIST:END -->

-This project follows the [all-contributors](https://github.com/all-contributors/all-contributors) specification. Contributions of any kind welcome!
+This project follows the [all-contributors](https://github.com/all-contributors/all-contributors) specification. Contributions of any kind welcome!
40 changes: 40 additions & 0 deletions action_files/conftest.py
@@ -0,0 +1,40 @@
import numpy as np
import pandas as pd
import pytest

from statsforecast.utils import generate_series


@pytest.fixture
def n_series():
return 2

@pytest.fixture
def horizon():
return 7

@pytest.fixture()
def local_data(n_series, horizon):
n_static = 2
series = generate_series(n_series, n_static_features=n_static)
static_features = []
for i in range(n_static):
name = f'static_{i}'
series[name] = series[name].astype(int)
static_features.append(name)
series['unique_id'] = series['unique_id'].astype(str)
uids = series['unique_id'].unique()
static_values = series.groupby('unique_id')[static_features].head(1)
static_values['unique_id'] = uids
last_train_dates = series.groupby('unique_id')['ds'].max()
pred_start = last_train_dates + pd.offsets.Day()
pred_end = last_train_dates + horizon * pd.offsets.Day()
pred_dates = np.hstack([pd.date_range(start, end) for start, end in zip(pred_start, pred_end)])
X_df = pd.DataFrame(
{
'unique_id': np.repeat(uids, horizon),
'ds': pred_dates,
}
)
X_df = X_df.merge(static_values, on='unique_id')
return series, X_df
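
For orientation, a minimal sketch (not part of the commit) of how the (series, X_df) pair built by local_data is consumed: X_df carries the future values of the static exogenous columns, and the updated pipeline helpers later in this diff pass it straight to StatsForecast.forecast. This assumes forecast() accepts an X_df argument, which is exactly what the action_files/utils.py changes below rely on.

from statsforecast import StatsForecast
from statsforecast.models import Naive

# series: training frame; X_df: one row per (unique_id, ds) over the
# 7-day horizon with the static exogenous columns filled in.
sf = StatsForecast(models=[Naive()], freq='D')
forecast = sf.forecast(df=series, h=7, X_df=X_df)
assert len(forecast) == series['unique_id'].nunique() * 7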
25 changes: 10 additions & 15 deletions action_files/test_dask.py
@@ -1,24 +1,19 @@
 import dask.dataframe as dd
 import pytest

-from statsforecast.utils import generate_series
 from .utils import pipeline, pipeline_with_level

-@pytest.fixture()
-def n_series():
-    return 2

+def to_distributed(df):
+    return dd.from_pandas(df, npartitions=2)
+
 @pytest.fixture()
-def sample_data(n_series):
-    series = generate_series(n_series).reset_index()
-    series['unique_id'] = series['unique_id'].astype(str)
-    series = dd.from_pandas(series, npartitions=2)
-    return series
+def sample_data(local_data):
+    series, X_df = local_data
+    return to_distributed(series), to_distributed(X_df)

-def test_dask_flow(sample_data, n_series):
-    horizon = 7
-    pipeline(sample_data, n_series, horizon)
+def test_dask_flow(horizon, sample_data, n_series):
+    pipeline(*sample_data, n_series, horizon)

-def test_dask_flow_with_level(sample_data, n_series):
-    horizon = 7
-    pipeline_with_level(sample_data, n_series, horizon)
+def test_dask_flow_with_level(horizon, sample_data, n_series):
+    pipeline_with_level(*sample_data, n_series, horizon)
29 changes: 14 additions & 15 deletions action_files/test_ray.py
@@ -1,24 +1,23 @@
+import sys
+
 import pytest
 import ray

-from statsforecast.utils import generate_series
 from .utils import pipeline, pipeline_with_level

-@pytest.fixture()
-def n_series():
-    return 2

+def to_distributed(df):
+    return ray.data.from_pandas(df).repartition(2)
+
 @pytest.fixture()
-def sample_data(n_series):
-    series = generate_series(n_series).reset_index()
-    series['unique_id'] = series['unique_id'].astype(str)
-    series = ray.data.from_pandas(series).repartition(2)
-    return series
+def sample_data(local_data):
+    series, X_df = local_data
+    return to_distributed(series), to_distributed(X_df)

-def test_ray_flow(sample_data, n_series):
-    horizon = 7
-    pipeline(sample_data, n_series, horizon)
+@pytest.mark.skipif(sys.version_info < (3, 8), reason="requires python >= 3.8")
+def test_ray_flow(horizon, sample_data, n_series):
+    pipeline(*sample_data, n_series, horizon)

-def test_ray_flow_with_level(sample_data, n_series):
-    horizon = 7
-    pipeline_with_level(sample_data, n_series, horizon)
+@pytest.mark.skipif(sys.version_info < (3, 8), reason="requires python >= 3.8")
+def test_ray_flow_with_level(horizon, sample_data, n_series):
+    pipeline_with_level(*sample_data, n_series, horizon)
39 changes: 20 additions & 19 deletions action_files/test_spark.py
@@ -1,26 +1,27 @@
+import sys
+
 import pytest
 from pyspark.sql import SparkSession

-from statsforecast.utils import generate_series
 from .utils import pipeline, pipeline_with_level

-@pytest.fixture()
-def n_series():
-    return 2

+@pytest.fixture
+def spark():
+    return SparkSession.builder.getOrCreate()
+
+def to_distributed(spark, df):
+    return spark.createDataFrame(df).repartition(2, 'unique_id')
+
 @pytest.fixture()
-def sample_data(n_series):
-    n_series = 2
-    series = generate_series(n_series).reset_index()
-    series['unique_id'] = series['unique_id'].astype(str)
-    spark = SparkSession.builder.getOrCreate()
-    series = spark.createDataFrame(series).repartition(2, 'unique_id')
-    return series
-
-def test_spark_flow(sample_data, n_series):
-    horizon = 7
-    pipeline(sample_data, n_series, horizon)
-
-def test_spark_flow_with_level(sample_data, n_series):
-    horizon = 7
-    pipeline_with_level(sample_data, n_series, horizon)
+def sample_data(spark, local_data):
+    series, X_df = local_data
+    return to_distributed(spark, series), to_distributed(spark, X_df)
+
+@pytest.mark.skipif(sys.version_info < (3, 8), reason="requires python >= 3.8")
+def test_spark_flow(horizon, sample_data, n_series):
+    pipeline(*sample_data, n_series, horizon)
+
+@pytest.mark.skipif(sys.version_info < (3, 8), reason="requires python >= 3.8")
+def test_spark_flow_with_level(horizon, sample_data, n_series):
+    pipeline_with_level(*sample_data, n_series, horizon)
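
All three backend tests can reuse the same pipeline helpers because StatsForecast dispatches on the type of df (pandas, Dask, Ray or Spark) through fugue. A hedged sketch of that behavior with Spark, not part of this commit and assuming pyspark plus the distributed extras are installed:

from pyspark.sql import SparkSession
from statsforecast import StatsForecast
from statsforecast.models import Naive
from statsforecast.utils import generate_series

spark = SparkSession.builder.getOrCreate()
series = generate_series(2)
series['unique_id'] = series['unique_id'].astype(str)  # Spark can't ingest categoricals
sdf = spark.createDataFrame(series).repartition(2, 'unique_id')

sf = StatsForecast(models=[Naive()], freq='D')
# With a Spark input the forecast comes back as a Spark DataFrame, which is
# why the helpers in utils.py collect with fa.as_pandas() before asserting.
sf.forecast(df=sdf, h=7).show()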
38 changes: 26 additions & 12 deletions action_files/utils.py
@@ -21,15 +21,19 @@
 )


-def pipeline(series, n_series, horizon):
+def pipeline(series, X_df, n_series, horizon):
     models = [
-        ADIDA(), AutoARIMA(season_length=7),
-        ARIMA(season_length=7, order=(0, 1, 2)),
-        CrostonClassic(), CrostonOptimized(),
-        CrostonSBA(), AutoETS(season_length=7),
-        HistoricAverage(),
-        IMAPA(), Naive(),
-        RandomWalkWithDrift(),
+        ADIDA(),
+        AutoARIMA(season_length=7),
+        ARIMA(season_length=7, order=(0, 1, 2)),
+        CrostonClassic(),
+        CrostonOptimized(),
+        CrostonSBA(),
+        AutoETS(season_length=7),
+        HistoricAverage(),
+        IMAPA(),
+        Naive(),
+        RandomWalkWithDrift(),
         SeasonalExponentialSmoothing(season_length=7, alpha=0.1),
         SeasonalNaive(season_length=7),
         SeasonalWindowAverage(season_length=7, window_size=4),
@@ -41,19 +45,29 @@ def pipeline(series, n_series, horizon):
         models=models,
         freq='D',
     )
-    forecast = fa.as_pandas(sf.forecast(df=series, h=horizon))
+    forecast = fa.as_pandas(sf.forecast(df=series, h=horizon, X_df=X_df))
     print(forecast)
     assert forecast.shape == (n_series * horizon, len(models) + 2)

-def pipeline_with_level(series, n_series, horizon):
+    n_windows = 2
+    cv = fa.as_pandas(sf.cross_validation(df=series, n_windows=n_windows, h=horizon))
+    assert cv.shape[0] == n_series * n_windows * horizon
+    assert cv.columns.tolist() == ['unique_id', 'ds', 'cutoff', 'y'] + [m.alias for m in models]
+
+def pipeline_with_level(series, X_df, n_series, horizon):
     models = [
         AutoARIMA(season_length=7),
     ]
     sf = StatsForecast(
         models=models,
         freq='D',
     )
-    forecast = fa.as_pandas(sf.forecast(df=series, h=horizon, level=[80, 90]))
+    forecast = fa.as_pandas(sf.forecast(df=series, h=horizon, X_df=X_df, level=[80, 90]))
     print(forecast.columns)
     expected = ["unique_id","ds","AutoARIMA","AutoARIMA-lo-90","AutoARIMA-hi-90", "AutoARIMA-lo-80","AutoARIMA-hi-80"]
-    assert forecast.shape == (n_series * horizon, len(expected))
+    assert forecast.shape == (n_series * horizon, len(expected))
+
+    n_windows = 2
+    cv = fa.as_pandas(sf.cross_validation(df=series, n_windows=n_windows, h=horizon, level=[80]))
+    assert cv.shape[0] == n_series * n_windows * horizon
+    assert cv.columns.tolist() == ['unique_id', 'ds', 'cutoff', 'y', 'AutoARIMA', 'AutoARIMA-lo-80', 'AutoARIMA-hi-80']
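
A note on the fa alias above: it is presumably fugue.api, whose as_pandas collects any fugue-supported DataFrame (pandas, Dask, Ray, Spark) into a local pandas frame so the shape and column assertions can run driver-side. A minimal round-trip sketch under that assumption:

import dask.dataframe as dd
import fugue.api as fa
import pandas as pd

pdf = pd.DataFrame({'unique_id': ['a', 'a', 'b'], 'y': [1.0, 2.0, 3.0]})
ddf = dd.from_pandas(pdf, npartitions=2)
# Collecting may reorder rows across partitions, hence the sorted comparison.
assert sorted(fa.as_pandas(ddf)['y'].tolist()) == [1.0, 2.0, 3.0]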
1 change: 1 addition & 0 deletions dev/environment.yml
@@ -16,6 +16,7 @@ dependencies:
   - statsmodels>=0.13.2
   - tabulate
   - plotly
+  - utilsforecast>=0.0.5
   - pip:
     - fugue[dask,ray]
     - nbdev
1 change: 1 addition & 0 deletions dev/local_environment.yml
@@ -15,6 +15,7 @@ dependencies:
   - statsmodels>=0.13.2
   - tabulate
   - plotly
+  - utilsforecast>=0.0.5
   - pip:
     - nbdev
     - plotly-resampler
10 changes: 7 additions & 3 deletions dev/requirements.txt
@@ -1,17 +1,21 @@
holidays<0.21
holidays<0.21
jupyterlab
matplotlib
numba>=0.55.0
numpy>=1.21.6
pandas>=1.3.5
pyspark>=3.3
pip
prophet
pyarrow
scipy>=1.7.3
statsmodels>=0.13.2
tabulate
plotly
utilsforecast>=0.0.5
fugue[dask,ray]
nbdev
tqdm
plotly-resampler
polars
supersmoother
supersmoother
tqdm
1 change: 1 addition & 0 deletions docs-scripts
Submodule docs-scripts added at d63d02
4 changes: 2 additions & 2 deletions experiments/ces/src/ces.py
@@ -2,8 +2,8 @@
 import time
 from functools import partial
 from multiprocessing import cpu_count
-os.environ['NUMBA_RELEASE_GIL'] = 'True'
-#os.environ['NUMBA_CACHE'] = 'True'
+os.environ['NIXTLA_NUMBA_RELEASE_GIL'] = '1'
+os.environ['NIXTLA_NUMBA_CACHE'] = '1'

 import fire
 import numpy as np
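
The renamed NIXTLA_* flags gate numba behavior at import time, which is why ces.py sets them before its other imports and why ci.yaml exports NIXTLA_NUMBA_CACHE for the whole workflow. A hedged sketch of how such a flag is commonly consumed (illustrative only; statsforecast's actual wiring may differ):

import os

import numpy as np
from numba import njit

# Read the flag before defining jitted functions; '1' enables on-disk caching
# of compiled kernels so repeat runs (e.g. in CI) skip JIT compilation.
CACHE = os.environ.get('NIXTLA_NUMBA_CACHE', '0') == '1'

@njit(cache=CACHE)
def moving_average(x, window):
    out = np.empty(x.size - window + 1)
    for i in range(out.size):
        out[i] = x[i:i + window].mean()
    return out

print(moving_average(np.arange(10.0), 3))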