diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml
index 1ddf28ea7..b470772ec 100644
--- a/.github/workflows/CI.yml
+++ b/.github/workflows/CI.yml
@@ -30,8 +30,8 @@ jobs:
os: [ubuntu-latest]
test-id: [main]
include:
- - julia-version: '1.6'
- python-version: '3.8'
+ - julia-version: '1.10'
+ python-version: '3.10'
os: ubuntu-latest
test-id: include
- julia-version: '1'
@@ -58,26 +58,18 @@ jobs:
- name: "Install PySR"
run: |
python -m pip install --upgrade pip
- pip install .
+ pip install '.[dev]'
python -c 'import pysr'
- name: "Assert Julia version"
if: ${{ matrix.julia-version != '1'}}
run: python3 -c "from pysr import jl; assert jl.VERSION.major == jl.seval('v\"${{ matrix.julia-version }}\"').major; assert jl.VERSION.minor == jl.seval('v\"${{ matrix.julia-version }}\"').minor"
- - name: "Install test dependencies"
- run: pip install coverage coveralls pytest nbval
- name: "Set up coverage for subprocesses"
run: echo 'import coverage; coverage.process_startup()' > "${{ github.workspace }}/sitecustomize.py"
- name: "Run tests"
run: coverage run -m pysr test main,cli,startup
- - name: "Install JAX"
- run: pip install jax jaxlib # (optional import)
- if: ${{ matrix.test-id == 'main' }}
- name: "Run JAX tests"
run: coverage run --append -m pysr test jax
if: ${{ matrix.test-id == 'main' }}
- - name: "Install Torch"
- run: pip install torch # (optional import)
- if: ${{ matrix.test-id == 'main' }}
- name: "Run Torch tests"
run: coverage run --append -m pysr test torch
if: ${{ matrix.test-id == 'main' }}
@@ -97,15 +89,15 @@ jobs:
julia-version: ['1']
include:
- os: ubuntu-latest
- python-version: '3.8'
- julia-version: '1.6'
+ python-version: '3.10'
+ julia-version: '1.10'
steps:
- uses: actions/checkout@v4
- uses: actions/setup-python@v5
- name: "Install PySR"
run: |
python -m pip install --upgrade pip
- pip install .
+ pip install '.[dev]'
- name: "Run development test"
run: PYSR_TEST_JULIA_VERSION=${{ matrix.julia-version }} PYSR_TEST_PYTHON_VERSION=${{ matrix.python-version }} python -m pysr test dev
@@ -131,7 +123,7 @@ jobs:
- name: "Set up Conda"
uses: conda-incubator/setup-miniconda@v3
with:
- miniforge-variant: Mambaforge
+ miniforge-variant: Miniforge3
miniforge-version: latest
auto-activate-base: true
python-version: ${{ matrix.python-version }}
@@ -176,7 +168,7 @@ jobs:
matrix:
python-version:
- '3.12'
- - '3.8'
+ - '3.10'
os: ['ubuntu-latest']
steps:
@@ -189,14 +181,37 @@ jobs:
- name: "Install PySR and all dependencies"
run: |
python -m pip install --upgrade pip
- pip install .
- pip install mypy
- - name: "Install additional dependencies"
- run: python -m pip install jax jaxlib torch
- if: ${{ matrix.python-version != '3.8' }}
+ pip install '.[dev]'
- name: "Run mypy"
run: python -m mypy --install-types --non-interactive pysr
- if: ${{ matrix.python-version != '3.8' }}
+ if: ${{ matrix.python-version != '3.10' }}
- name: "Run compatible mypy"
run: python -m mypy --ignore-missing-imports pysr
- if: ${{ matrix.python-version == '3.8' }}
+ if: ${{ matrix.python-version == '3.10' }}
+
+ beartype:
+ name: Test with beartype
+ runs-on: ubuntu-latest
+ defaults:
+ run:
+ shell: bash -l {0}
+ env:
+ PYSR_USE_BEARTYPE: "1"
+ strategy:
+ matrix:
+ python-version: ['3.12']
+ os: ['ubuntu-latest']
+
+ steps:
+ - uses: actions/checkout@v4
+ - name: "Set up Python"
+ uses: actions/setup-python@v5
+ with:
+ python-version: ${{ matrix.python-version }}
+ cache: pip
+ - name: "Install PySR and all dependencies"
+ run: |
+ python -m pip install --upgrade pip
+ pip install '.[dev]'
+ - name: "Run tests"
+ run: python -m pysr test main,jax,torch
diff --git a/.github/workflows/CI_Windows.yml b/.github/workflows/CI_Windows.yml
index f66644342..a62b7af6e 100644
--- a/.github/workflows/CI_Windows.yml
+++ b/.github/workflows/CI_Windows.yml
@@ -46,8 +46,7 @@ jobs:
- name: "Install PySR"
run: |
python -m pip install --upgrade pip
- pip install pytest nbval "numpy<2.0.0"
- pip install .
+ pip install '.[dev]'
python -c 'import pysr'
- name: "Run tests"
run: |
diff --git a/.github/workflows/CI_conda_forge.yml b/.github/workflows/CI_conda_forge.yml
index 98d064b93..097a22bc2 100644
--- a/.github/workflows/CI_conda_forge.yml
+++ b/.github/workflows/CI_conda_forge.yml
@@ -20,17 +20,17 @@ jobs:
strategy:
fail-fast: false
matrix:
- python-version: ['3.9', '3.10', '3.11', '3.12']
+ python-version: ['3.10', '3.11', '3.12']
os: ['ubuntu-latest']
use-mamba: [true, false]
include:
- - python-version: 3.9
+ - python-version: 3.10
os: 'windows-latest'
use-mamba: true
- python-version: 3.12
os: 'windows-latest'
use-mamba: true
- - python-version: 3.9
+ - python-version: 3.10
os: 'macos-latest'
use-mamba: true
- python-version: 3.12
diff --git a/.github/workflows/CI_docker_large_nightly.yml b/.github/workflows/CI_docker_large_nightly.yml
index 185383ccf..4e6dbf8ba 100644
--- a/.github/workflows/CI_docker_large_nightly.yml
+++ b/.github/workflows/CI_docker_large_nightly.yml
@@ -18,8 +18,8 @@ jobs:
strategy:
fail-fast: false
matrix:
- julia-version: ['1.6', '1']
- python-version: ['3.8', '3.12']
+ julia-version: ['1.10', '1']
+ python-version: ['3.10', '3.12']
os: [ubuntu-latest]
arch: ['linux/amd64', 'linux/arm64']
diff --git a/.github/workflows/CI_large_nightly.yml b/.github/workflows/CI_large_nightly.yml
index cbd9a7ef3..4598468bf 100644
--- a/.github/workflows/CI_large_nightly.yml
+++ b/.github/workflows/CI_large_nightly.yml
@@ -23,8 +23,8 @@ jobs:
strategy:
fail-fast: false
matrix:
- julia-version: ['1.6', '1.8', '1.10']
- python-version: ['3.8', '3.10', '3.12']
+ julia-version: ['1.10', '1']
+ python-version: ['3.10', '3.12']
os: [ubuntu-latest, macos-latest, windows-latest]
steps:
@@ -40,8 +40,7 @@ jobs:
- name: "Install PySR"
run: |
python -m pip install --upgrade pip
- pip install pytest nbval
- pip install .
+ pip install '.[dev]'
python -c 'import pysr'
- name: "Assert Julia version"
if: ${{ matrix.julia-version != '1'}}
diff --git a/.github/workflows/CI_mac.yml b/.github/workflows/CI_mac.yml
index fab20425c..68a940ee3 100644
--- a/.github/workflows/CI_mac.yml
+++ b/.github/workflows/CI_mac.yml
@@ -46,17 +46,12 @@ jobs:
- name: "Install PySR"
run: |
python -m pip install --upgrade pip
- pip install pytest nbval
- pip install .
+ pip install '.[dev]'
python -c 'import pysr'
- name: "Run tests"
run: |
python -m pysr test main,cli,startup
- - name: "Install JAX"
- run: pip install jax jaxlib # (optional import)
- name: "Run JAX tests"
run: python -m pysr test jax
- - name: "Install Torch"
- run: pip install torch # (optional import)
- name: "Run Torch tests"
run: python -m pysr test torch
diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml
index af36db783..c277815c3 100644
--- a/.github/workflows/docs.yml
+++ b/.github/workflows/docs.yml
@@ -25,7 +25,7 @@ jobs:
- name: "Set up Python"
uses: actions/setup-python@v5
with:
- python-version: 3.9
+ python-version: 3.12
cache: pip
- name: "Install packages for docs building"
run: pip install -r docs/requirements.txt
@@ -33,5 +33,18 @@ jobs:
run: pip install . && python -c 'import pysr'
- name: "Build API docs"
run: cd docs && ./gen_docs.sh
- - name: "Deploy documentation"
+ - name: "Deploy documentation to primary repository"
run: mkdocs gh-deploy --force
+ - name: "Deploy documentation to secondary repository"
+ env:
+ DEPLOY_KEY: ${{ secrets.DAMTP_DEPLOY_KEY }}
+ run: |
+ # Set up SSH key for authentication
+ mkdir -p ~/.ssh
+ echo "$DEPLOY_KEY" > ~/.ssh/id_rsa
+ chmod 600 ~/.ssh/id_rsa
+ ssh-keyscan github.com >> ~/.ssh/known_hosts
+
+ git checkout gh-pages
+ git remote add secondary git@github.com:ai-damtp-cam-ac-uk/pysr.git
+ git push secondary gh-pages --force
diff --git a/.gitignore b/.gitignore
index c40841695..c864e9903 100644
--- a/.gitignore
+++ b/.gitignore
@@ -25,3 +25,5 @@ site
venv
requirements-dev.lock
requirements.lock
+outputs
+.mypy_cache
diff --git a/Apptainer.def b/Apptainer.def
index 962f81687..baad8a4c8 100644
--- a/Apptainer.def
+++ b/Apptainer.def
@@ -1,10 +1,10 @@
# Build an Apptainer SIF file containing a working copy of PySR and its prereqs
Bootstrap: docker
-From: julia:1.10.4-bullseye
+From: julia:1.11.1-bullseye
Stage: jl
Bootstrap: docker
-From: python:3.12-bullseye
+From: python:3.12.6-bullseye
Stage: runtime
%environment
@@ -19,7 +19,6 @@ Stage: runtime
/usr/local/julia /usr/local/julia
%files
- ./requirements.txt /pysr/requirements.txt
./pyproject.toml /pysr/pyproject.toml
./setup.py /pysr/setup.py
./pysr /pysr/pysr
diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index 409cb7f1c..b7e245b59 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -6,7 +6,7 @@ In this guide you will get an overview of the contribution workflow from opening
## New contributor guide
-To get an overview of the project, read PySR's [README](README.md). The [PySR docs](https://astroautomata.com/PySR/) give additional information.
+To get an overview of the project, read PySR's [README](README.md). The [PySR docs](https://ai.damtp.cam.ac.uk/pysr/) give additional information.
Here are some resources to help you get started with open source contributions in general:
- [Finding ways to contribute to open source on GitHub](https://docs.github.com/en/get-started/exploring-projects-on-github/finding-ways-to-contribute-to-open-source-on-github)
@@ -39,7 +39,7 @@ Scan through our [existing issues](https://github.com/MilesCranmer/PySR/issues)
2. Create a working branch and start with your changes!
3. (Optional) If you would like to make changes to PySR itself, skip to step 4. However, if you are interested in making changes to the _symbolic regression code_ itself,
-check out the [guide](https://astroautomata.com/PySR/backend/) on modifying a custom SymbolicRegression.jl library.
+check out the [guide](https://ai.damtp.cam.ac.uk/pysr/backend/) on modifying a custom SymbolicRegression.jl library.
In this case, you might instead be interested in making suggestions to the [SymbolicRegression.jl](http://github.com/MilesCranmer/SymbolicRegression.jl) library.
4. You can install your local version of PySR with `pip install -e .`, and run tests with `python -m pysr test main`.
diff --git a/Dockerfile b/Dockerfile
index ed0b5f891..a446f3278 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,8 +1,8 @@
# This builds a dockerfile containing a working copy of PySR
# with all pre-requisites installed.
-ARG JLVERSION=1.10.4
-ARG PYVERSION=3.12.2
+ARG JLVERSION=1.11.1
+ARG PYVERSION=3.12.6
ARG BASE_IMAGE=bullseye
FROM julia:${JLVERSION}-${BASE_IMAGE} AS jl
@@ -17,10 +17,6 @@ RUN pip install --no-cache-dir ipython matplotlib
WORKDIR /pysr
-# Caches install (https://stackoverflow.com/questions/25305788/how-to-avoid-reinstalling-packages-when-building-docker-image-for-python-project)
-ADD ./requirements.txt /pysr/requirements.txt
-RUN pip3 install --no-cache-dir -r /pysr/requirements.txt
-
# Install PySR:
# We do a minimal copy so it doesn't need to rerun at every file change:
ADD ./pyproject.toml /pysr/pyproject.toml
diff --git a/README.md b/README.md
index aa1801933..299f019a9 100644
--- a/README.md
+++ b/README.md
@@ -11,7 +11,7 @@ https://github.com/MilesCranmer/PySR/assets/7593028/c8511a49-b408-488f-8f18-b174
| **Docs** | **Forums** | **Paper** | **colab demo** |
|:---:|:---:|:---:|:---:|
-|[![Documentation](https://github.com/MilesCranmer/PySR/actions/workflows/docs.yml/badge.svg)](https://astroautomata.com/PySR/)|[![Discussions](https://img.shields.io/badge/discussions-github-informational)](https://github.com/MilesCranmer/PySR/discussions)|[![Paper](https://img.shields.io/badge/arXiv-2305.01582-b31b1b)](https://arxiv.org/abs/2305.01582)|[![Colab](https://img.shields.io/badge/colab-notebook-yellow)](https://colab.research.google.com/github/MilesCranmer/PySR/blob/master/examples/pysr_demo.ipynb)|
+|[![Documentation](https://github.com/MilesCranmer/PySR/actions/workflows/docs.yml/badge.svg)](https://ai.damtp.cam.ac.uk/pysr/)|[![Discussions](https://img.shields.io/badge/discussions-github-informational)](https://github.com/MilesCranmer/PySR/discussions)|[![Paper](https://img.shields.io/badge/arXiv-2305.01582-b31b1b)](https://arxiv.org/abs/2305.01582)|[![Colab](https://img.shields.io/badge/colab-notebook-yellow)](https://colab.research.google.com/github/MilesCranmer/PySR/blob/master/examples/pysr_demo.ipynb)|
| **pip** | **conda** | **Stats** |
| :---: | :---: | :---: |
@@ -20,14 +20,14 @@ https://github.com/MilesCranmer/PySR/assets/7593028/c8511a49-b408-488f-8f18-b174
If you find PySR useful, please cite the paper [arXiv:2305.01582](https://arxiv.org/abs/2305.01582).
-If you've finished a project with PySR, please submit a PR to showcase your work on the [research showcase page](https://astroautomata.com/PySR/papers)!
+If you've finished a project with PySR, please submit a PR to showcase your work on the [research showcase page](https://ai.damtp.cam.ac.uk/pysr/papers)!
**Contents**:
- [Why PySR?](#why-pysr)
- [Installation](#installation)
- [Quickstart](#quickstart)
-- [→ Documentation](https://astroautomata.com/PySR)
+- [→ Documentation](https://ai.damtp.cam.ac.uk/pysr)
- [Contributors](#contributors-)
@@ -178,6 +178,7 @@ PySR's main interface is in the style of scikit-learn:
from pysr import PySRRegressor
model = PySRRegressor(
+ maxsize=20,
niterations=40, # < Increase me for better results
binary_operators=["+", "*"],
unary_operators=[
@@ -254,22 +255,21 @@ model = PySRRegressor.from_file("hall_of_fame.2022-08-10_100832.281.pkl")
There are several other useful features such as denoising (e.g., `denoise=True`),
feature selection (e.g., `select_k_features=3`).
-For examples of these and other features, see the [examples page](https://astroautomata.com/PySR/examples).
-For a detailed look at more options, see the [options page](https://astroautomata.com/PySR/options).
-You can also see the full API at [this page](https://astroautomata.com/PySR/api).
-There are also tips for tuning PySR on [this page](https://astroautomata.com/PySR/tuning).
+For examples of these and other features, see the [examples page](https://ai.damtp.cam.ac.uk/pysr/examples).
+For a detailed look at more options, see the [options page](https://ai.damtp.cam.ac.uk/pysr/options).
+You can also see the full API at [this page](https://ai.damtp.cam.ac.uk/pysr/api).
+There are also tips for tuning PySR on [this page](https://ai.damtp.cam.ac.uk/pysr/tuning).
### Detailed Example
The following code makes use of as many PySR features as possible.
Note that this is just a demonstration of features and you should not use this example as-is.
-For details on what each parameter does, check out the [API page](https://astroautomata.com/PySR/api/).
+For details on what each parameter does, check out the [API page](https://ai.damtp.cam.ac.uk/pysr/api/).
```python
model = PySRRegressor(
- procs=4,
populations=8,
- # ^ 2 populations per core, so one is always running.
+ # ^ Assuming we have 4 cores, this means 2 populations per core, so one is always running.
population_size=50,
# ^ Slightly larger populations, for greater diversity.
ncycles_per_iteration=500,
diff --git a/docs/_api.md b/docs/_api.md
index bbc6c1f2f..38b68818e 100644
--- a/docs/_api.md
+++ b/docs/_api.md
@@ -60,3 +60,43 @@ PARAMSKEY
show_root_heading: true
heading_level: 3
show_root_full_path: false
+
+## Expression Specifications
+
+::: pysr.ExpressionSpec
+ options:
+ show_root_heading: true
+ heading_level: 3
+ show_root_full_path: false
+
+::: pysr.TemplateExpressionSpec
+ options:
+ show_root_heading: true
+ heading_level: 3
+ show_root_full_path: false
+
+::: pysr.ParametricExpressionSpec
+ options:
+ show_root_heading: true
+ heading_level: 3
+ show_root_full_path: false
+
+::: pysr.AbstractExpressionSpec
+ options:
+ show_root_heading: true
+ heading_level: 3
+ show_root_full_path: false
+
+## Logger Specifications
+
+::: pysr.TensorBoardLoggerSpec
+ options:
+ show_root_heading: true
+ heading_level: 3
+ show_root_full_path: false
+
+::: pysr.AbstractLoggerSpec
+ options:
+ show_root_heading: true
+ heading_level: 3
+ show_root_full_path: false
diff --git a/docs/backend.md b/docs/backend.md
index b7575d143..092251fbe 100644
--- a/docs/backend.md
+++ b/docs/backend.md
@@ -33,9 +33,9 @@ The main search code can be found in `src/SymbolicRegression.jl`.
Here are some tips:
-- The documentation for the backend is given [here](https://astroautomata.com/SymbolicRegression.jl/dev/).
+- The documentation for the backend is given [here](https://ai.damtp.cam.ac.uk/symbolicregression/dev/).
- Throughout the package, you will often see template functions which typically use a symbol `T` (such as in the string `where {T<:Real}`). Here, `T` is simply the datatype of the input data and stored constants, such as `Float32` or `Float64`. Writing functions in this way lets us write functions generic to types, while still having access to the specific type specified at compilation time.
-- Expressions are stored as binary trees, using the `Node{T}` type, described [here](https://astroautomata.com/SymbolicRegression.jl/dev/types/#SymbolicRegression.CoreModule.EquationModule.Node).
+- Expressions are stored as binary trees, using the `Node{T}` type, described [here](https://ai.damtp.cam.ac.uk/symbolicregression/dev/types/#SymbolicRegression.CoreModule.EquationModule.Node).
- For reference, the main loop itself is found in the `equation_search` function inside [`src/SymbolicRegression.jl`](https://github.com/MilesCranmer/SymbolicRegression.jl/blob/master/src/SymbolicRegression.jl).
- Parts of the code which are typically edited by users include:
- [`src/CheckConstraints.jl`](https://github.com/MilesCranmer/SymbolicRegression.jl/blob/master/src/CheckConstraints.jl), particularly the function `check_constraints`. This function checks whether a given expression satisfies constraints, such as having a complexity lower than `maxsize`, and whether it contains any forbidden nestings of functions.
@@ -70,6 +70,6 @@ For more information on `juliapkg.json`, see [`pyjuliapkg`](https://github.com/J
## Additional notes
-If you get comfortable enough with the backend, you might consider using the Julia package directly: the API is given on the [SymbolicRegression.jl documentation](https://astroautomata.com/SymbolicRegression.jl/dev/).
+If you get comfortable enough with the backend, you might consider using the Julia package directly: the API is given on the [SymbolicRegression.jl documentation](https://ai.damtp.cam.ac.uk/symbolicregression/dev/).
If you make a change that you think could be useful to other users, don't hesitate to open a pull request on either the PySR or SymbolicRegression.jl repositories! Contributions are very appreciated.
diff --git a/docs/examples.md b/docs/examples.md
index 754875e7c..8d01b8491 100644
--- a/docs/examples.md
+++ b/docs/examples.md
a real number from the loss function). But you don't need to worry about this;
just make sure to return a scalar number of type `L`.
The `tree` argument is the current expression being evaluated. You can read
-about the `tree` fields [here](https://astroautomata.com/SymbolicRegression.jl/stable/types/).
+about the `tree` fields [here](https://ai.damtp.cam.ac.uk/symbolicregression/stable/types/).
For example, let's fix a symbolic form of an expression,
as a rational function. i.e., $P(X)/Q(X)$ for polynomials $P$ and $Q$.
@@ -523,7 +523,128 @@ Note that this expression has a large dynamic range so may be difficult to find.
Note that you can also search for exclusively dimensionless constants by setting
`dimensionless_constants_only` to `true`.
-## 11. Additional features
+## 11. Expression Specifications
+
+PySR 1.0 introduces powerful expression specifications that allow you to define structured equations. Here are two examples:
+
+### Template Expressions
+
+`TemplateExpressionSpec` allows you to define a specific structure for the equation.
+For example, let's say we want to learn an equation of the form:
+
+$$ y = \sin(f(x_1, x_2)) + g(x_3) $$
+
+We can do this as follows:
+
+```python
+import numpy as np
+from pysr import PySRRegressor, TemplateExpressionSpec
+
+# Create data
+X = np.random.randn(1000, 3)
+y = np.sin(X[:, 0] + X[:, 1]) + X[:, 2]**2
+
+# Define template: we want sin(f(x1, x2)) + g(x3)
+template = TemplateExpressionSpec(
+ function_symbols=["f", "g"],
+ combine="((; f, g), (x1, x2, x3)) -> sin(f(x1, x2)) + g(x3)",
+)
+
+model = PySRRegressor(
+ expression_spec=template,
+ binary_operators=["+", "*", "-", "/"],
+ unary_operators=["sin"],
+ maxsize=10,
+)
+model.fit(X, y)
+```
+
+You can also use zero-argument functions for learning constants, like:
+
+```python
+template = TemplateExpressionSpec(
+ function_symbols=["a", "f"],
+ combine="((; a, f), (x, y)) -> a() * sin(f(x, y))",
+)
+```
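+
+As a minimal sketch (with made-up data, and reusing the imports from the
+example above), fitting with this template lets `a()` converge to the unknown
+constant:
+
+```python
+X = np.random.randn(1000, 2)
+y = 2.5 * np.sin(X[:, 0] + X[:, 1])  # a() should converge to roughly 2.5
+
+model = PySRRegressor(
+    expression_spec=template,
+    binary_operators=["+", "*"],
+    unary_operators=["sin"],
+)
+model.fit(X, y)
+```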
+
+### Parametric Expressions
+
+When your data has categories with shared equation structure but different parameters,
+you can use a `ParametricExpressionSpec`. Let's say we would like to learn the expression:
+
+$$ y = \alpha \sin(x_1) + \beta $$
+
+for three different values of $\alpha$ and $\beta$.
+
+```python
+import numpy as np
+from pysr import PySRRegressor, ParametricExpressionSpec
+
+# Create data with 3 categories
+X = np.random.uniform(-3, 3, (1000, 2))
+category = np.random.randint(0, 3, 1000)
+
+# Parameters for each category
+offsets = [0.1, 1.5, -0.5]
+scales = [1.0, 2.0, 0.5]
+
+# y = scale[category] * sin(x1) + offset[category]
+y = np.array([
+ scales[c] * np.sin(x1) + offsets[c]
+ for x1, c in zip(X[:, 0], category)
+])
+
+model = PySRRegressor(
+ expression_spec=ParametricExpressionSpec(max_parameters=2),
+ binary_operators=["+", "*", "-", "/"],
+ unary_operators=["sin"],
+ maxsize=10,
+)
+model.fit(X, y, category=category)
+
+# Predicting on new data:
+# model.predict(X_test, category=category_test)
+```
+
+See [Expression Specifications](expression-specs.md) for more details.
+
+## 12. Using TensorBoard for Logging
+
+You can use TensorBoard to visualize the search progress, as well as
+record hyperparameters and final metrics (like `min_loss` and `pareto_volume`,
+the latter of which is a performance measure of the entire Pareto front).
+
+```python
+import numpy as np
+from pysr import PySRRegressor, TensorBoardLoggerSpec
+
+rstate = np.random.RandomState(42)
+
+# Uniform dist between -3 and 3:
+X = rstate.uniform(-3, 3, (1000, 2))
+y = np.exp(X[:, 0]) + X[:, 1]
+
+# Create a logger that writes to "logs/run*":
+logger_spec = TensorBoardLoggerSpec(
+ log_dir="logs/run",
+ log_interval=10, # Log every 10 iterations
+)
+
+model = PySRRegressor(
+ binary_operators=["+", "*", "-", "/"],
+ logger_spec=logger_spec,
+)
+model.fit(X, y)
+```
+
+You can then view the logs with:
+
+```bash
+tensorboard --logdir logs/
+```
+
+## 13. Additional features
For the many other features available in PySR, please
read the [Options section](options.md).
diff --git a/docs/options.md b/docs/options.md
index 0ccacbcab..7a8f5c11e 100644
--- a/docs/options.md
+++ b/docs/options.md
@@ -276,7 +276,7 @@ model = PySRRegressor(..., weights=weights, elementwise_loss="myloss(x, y, w) =
model.fit(..., weights=weights)
```
-Built-in loss (faster) (see [losses](https://astroautomata.com/SymbolicRegression.jl/dev/losses/)).
+Built-in loss (faster) (see [losses](https://ai.damtp.cam.ac.uk/symbolicregression/dev/losses/)).
This one computes the L3 norm:
```python
diff --git a/environment.yml b/environment.yml
index c7d6ceebf..994390c98 100644
--- a/environment.yml
+++ b/environment.yml
@@ -2,10 +2,10 @@ name: test
channels:
- conda-forge
dependencies:
- - python>=3.8
+ - python>=3.10
- sympy>=1.0.0,<2.0.0
- pandas>=0.21.0,<3.0.0
- numpy>=1.13.0,<2.0.0
- scikit-learn>=1.0.0,<2.0.0
- - pyjuliacall>=0.9.21,<0.9.22
+  - pyjuliacall>=0.9.23,<0.9.24
- click>=7.0.0,<9.0.0
diff --git a/examples/pysr_demo.ipynb b/examples/pysr_demo.ipynb
index 8822effec..fabf07971 100644
--- a/examples/pysr_demo.ipynb
+++ b/examples/pysr_demo.ipynb
@@ -321,7 +321,7 @@
"id": "qvgVbOoSFtQY"
},
"source": [
- "A full list of operators is given here: https://astroautomata.com/PySR/operators,\n",
+ "A full list of operators is given here: https://ai.damtp.cam.ac.uk/pysr/operators,\n",
"but we can also use any binary or unary operator in `julia`, or define our own as arbitrary functions.\n",
"\n",
"Say that we want a command to do quartic powers:\n",
@@ -1498,7 +1498,7 @@
"id": "S5dO61g1bDhk"
},
"source": [
- "The full list of PySR parameters can be found here: https://astroautomata.com/PySR/api"
+ "The full list of PySR parameters can be found here: https://ai.damtp.cam.ac.uk/pysr/api"
]
}
],
diff --git a/mkdocs.yml b/mkdocs.yml
index cff11241c..c996a0062 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -36,7 +36,7 @@ nav:
- interactive-docs.md
extra:
- homepage: https://astroautomata.com/PySR
+ homepage: https://ai.damtp.cam.ac.uk/pysr
extra_css:
- stylesheets/extra.css
diff --git a/pyproject.toml b/pyproject.toml
index 64f4673a7..b496a2213 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,20 +4,49 @@ build-backend = "setuptools.build_meta"
[project]
name = "pysr"
-version = "0.19.4"
+version = "1.0.0"
authors = [
{name = "Miles Cranmer", email = "miles.cranmer@gmail.com"},
]
description = "Simple and efficient symbolic regression"
readme = {file = "README.md", content-type = "text/markdown"}
license = {file = "LICENSE"}
-requires-python = ">=3.8"
+requires-python = ">=3.10"
classifiers = [
"Programming Language :: Python :: 3",
"Operating System :: OS Independent",
"License :: OSI Approved :: Apache Software License"
]
-dynamic = ["dependencies"]
+dependencies = [
+ "sympy>=1.0.0,<2.0.0",
+ "pandas>=0.21.0,<3.0.0",
+ "numpy>=1.13.0,<3.0.0",
+ "scikit_learn>=1.0.0,<2.0.0",
+ "juliacall==0.9.23",
+ "click>=7.0.0,<9.0.0",
+ "setuptools>=50.0.0",
+]
+
+[project.optional-dependencies]
+dev = [
+ "beartype>=0.19,<0.20",
+ "coverage>=7,<8",
+ "coveralls>=4,<5",
+ "ipykernel>=6,<7",
+ "ipython>=8,<9",
+ "jax[cpu]>=0.4,<0.5",
+ "jupyter>=1,<2",
+ "mypy>=1,<2",
+ "nbval>=0.11,<0.12",
+ "pandas-stubs",
+ "pre-commit>=3.7,<5",
+ "pytest-cov>=5,<7",
+ "pytest>=8,<9",
+ "tensorboard>=2,<3",
+ "torch>=2,<3",
+ "types-openpyxl",
+ "types-pytz",
+]
[tool.setuptools]
packages = ["pysr", "pysr._cli", "pysr.test"]
@@ -29,17 +58,3 @@ dependencies = {file = "requirements.txt"}
[tool.isort]
profile = "black"
-
-[tool.rye]
-dev-dependencies = [
- "pre-commit>=3.7.0",
- "ipython>=8.23.0",
- "ipykernel>=6.29.4",
- "mypy>=1.10.0",
- "jax[cpu]>=0.4.26",
- "torch>=2.3.0",
- "pandas-stubs>=2.2.1.240316",
- "types-pytz>=2024.1.0.20240417",
- "types-openpyxl>=3.1.0.20240428",
- "coverage>=7.5.3",
-]
diff --git a/pysr/__init__.py b/pysr/__init__.py
index b40ee840e..e26174aba 100644
--- a/pysr/__init__.py
+++ b/pysr/__init__.py
@@ -1,3 +1,10 @@
+import os
+
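+# Opt-in runtime type checking: set PYSR_USE_BEARTYPE=1 in the environment
+# *before* importing pysr (the `beartype` CI job does exactly this).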
+if os.environ.get("PYSR_USE_BEARTYPE", "0") == "1":
+ from beartype.claw import beartype_this_package
+
+ beartype_this_package()
+
# This must be imported as early as possible to prevent
# library linking issues caused by numpy/pytorch/etc. importing
# old libraries:
@@ -7,7 +14,14 @@
from .deprecated import best, best_callable, best_row, best_tex, install, pysr
from .export_jax import sympy2jax
from .export_torch import sympy2torch
+from .expression_specs import (
+ AbstractExpressionSpec,
+ ExpressionSpec,
+ ParametricExpressionSpec,
+ TemplateExpressionSpec,
+)
from .julia_extensions import load_all_packages
+from .logger_specs import AbstractLoggerSpec, TensorBoardLoggerSpec
from .sr import PySRRegressor
# This file is created by setuptools_scm during the build process:
@@ -22,6 +36,12 @@
"install",
"load_all_packages",
"PySRRegressor",
+ "AbstractExpressionSpec",
+ "ExpressionSpec",
+ "TemplateExpressionSpec",
+ "ParametricExpressionSpec",
+ "AbstractLoggerSpec",
+ "TensorBoardLoggerSpec",
"best",
"best_callable",
"best_row",
diff --git a/pysr/denoising.py b/pysr/denoising.py
index 5ab6a168e..92a640ee8 100644
--- a/pysr/denoising.py
+++ b/pysr/denoising.py
@@ -1,6 +1,6 @@
"""Functions for denoising data during preprocessing."""
-from typing import Optional, Tuple, cast
+from typing import cast
import numpy as np
from numpy import ndarray
@@ -9,9 +9,9 @@
def denoise(
X: ndarray,
y: ndarray,
- Xresampled: Optional[ndarray] = None,
- random_state: Optional[np.random.RandomState] = None,
-) -> Tuple[ndarray, ndarray]:
+ Xresampled: ndarray | None = None,
+ random_state: np.random.RandomState | None = None,
+) -> tuple[ndarray, ndarray]:
"""Denoise the dataset using a Gaussian process."""
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, ConstantKernel, WhiteKernel
@@ -31,8 +31,8 @@ def denoise(
def multi_denoise(
X: ndarray,
y: ndarray,
- Xresampled: Optional[ndarray] = None,
- random_state: Optional[np.random.RandomState] = None,
+ Xresampled: ndarray | None = None,
+ random_state: np.random.RandomState | None = None,
):
"""Perform `denoise` along each column of `y` independently."""
y = np.stack(
diff --git a/pysr/export.py b/pysr/export.py
new file mode 100644
index 000000000..c1b589b84
--- /dev/null
+++ b/pysr/export.py
@@ -0,0 +1,88 @@
+import copy
+from collections.abc import Callable
+
+import numpy as np
+import pandas as pd
+from numpy.typing import NDArray
+
+from .export_jax import sympy2jax
+from .export_numpy import sympy2numpy
+from .export_sympy import create_sympy_symbols, pysr2sympy
+from .export_torch import sympy2torch
+from .utils import ArrayLike
+
+
+def add_export_formats(
+ output: pd.DataFrame,
+ *,
+ feature_names_in: ArrayLike[str],
+ selection_mask: NDArray[np.bool_] | None = None,
+ extra_sympy_mappings: dict[str, Callable] | None = None,
+ extra_torch_mappings: dict[Callable, Callable] | None = None,
+ output_torch_format: bool = False,
+ extra_jax_mappings: dict[Callable, str] | None = None,
+ output_jax_format: bool = False,
+) -> pd.DataFrame:
+ """Create export formats for an equations dataframe.
+
+ Returns a new dataframe containing only the exported formats.
+ """
+ output = copy.deepcopy(output)
+
+ sympy_format = []
+ lambda_format = []
+ jax_format = []
+ torch_format = []
+
+ for _, eqn_row in output.iterrows():
+ eqn = pysr2sympy(
+ eqn_row["equation"],
+ feature_names_in=feature_names_in,
+ extra_sympy_mappings=extra_sympy_mappings,
+ )
+ sympy_format.append(eqn)
+
+ # NumPy:
+ sympy_symbols = create_sympy_symbols(feature_names_in)
+ lambda_format.append(
+ sympy2numpy(
+ eqn,
+ sympy_symbols,
+ selection=selection_mask,
+ )
+ )
+
+ # JAX:
+ if output_jax_format:
+ func, params = sympy2jax(
+ eqn,
+ sympy_symbols,
+ selection=selection_mask,
+ extra_jax_mappings=extra_jax_mappings,
+ )
+ jax_format.append({"callable": func, "parameters": params})
+
+ # Torch:
+ if output_torch_format:
+ module = sympy2torch(
+ eqn,
+ sympy_symbols,
+ selection=selection_mask,
+ extra_torch_mappings=extra_torch_mappings,
+ )
+ torch_format.append(module)
+
+ exports = pd.DataFrame(
+ {
+ "sympy_format": sympy_format,
+ "lambda_format": lambda_format,
+ },
+ index=output.index,
+ )
+
+ if output_jax_format:
+ exports["jax_format"] = jax_format
+ if output_torch_format:
+ exports["torch_format"] = torch_format
+
+ return exports
diff --git a/pysr/export_latex.py b/pysr/export_latex.py
index b7815d07c..55d24de0b 100644
--- a/pysr/export_latex.py
+++ b/pysr/export_latex.py
@@ -1,7 +1,5 @@
"""Functions to help export PySR equations to LaTeX."""
-from typing import List, Optional, Tuple
-
import pandas as pd
import sympy # type: ignore
from sympy.printing.latex import LatexPrinter # type: ignore
@@ -28,8 +26,8 @@ def sympy2latex(expr, prec=3, full_prec=True, **settings) -> str:
def generate_table_environment(
- columns: List[str] = ["equation", "complexity", "loss"]
-) -> Tuple[str, str]:
+ columns: list[str] = ["equation", "complexity", "loss"]
+) -> tuple[str, str]:
margins = "c" * len(columns)
column_map = {
"complexity": "Complexity",
@@ -61,9 +59,9 @@ def generate_table_environment(
def sympy2latextable(
equations: pd.DataFrame,
- indices: Optional[List[int]] = None,
+ indices: list[int] | None = None,
precision: int = 3,
- columns: List[str] = ["equation", "complexity", "loss", "score"],
+ columns: list[str] = ["equation", "complexity", "loss", "score"],
max_equation_length: int = 50,
output_variable_name: str = "y",
) -> str:
@@ -128,11 +126,11 @@ def sympy2latextable(
def sympy2multilatextable(
- equations: List[pd.DataFrame],
- indices: Optional[List[List[int]]] = None,
+ equations: list[pd.DataFrame],
+ indices: list[list[int]] | None = None,
precision: int = 3,
- columns: List[str] = ["equation", "complexity", "loss", "score"],
- output_variable_names: Optional[List[str]] = None,
+ columns: list[str] = ["equation", "complexity", "loss", "score"],
+ output_variable_names: list[str] | None = None,
) -> str:
"""Generate multiple latex tables for a list of equation sets."""
# TODO: Let user specify custom output variable
diff --git a/pysr/export_numpy.py b/pysr/export_numpy.py
index 4c1d12e73..163726d6f 100644
--- a/pysr/export_numpy.py
+++ b/pysr/export_numpy.py
@@ -1,7 +1,6 @@
"""Code for exporting discovered expressions to numpy"""
import warnings
-from typing import List, Union
import numpy as np
import pandas as pd
@@ -17,8 +16,8 @@ class CallableEquation:
"""Simple wrapper for numpy lambda functions built with sympy"""
_sympy: Expr
- _sympy_symbols: List[Symbol]
- _selection: Union[NDArray[np.bool_], None]
+ _sympy_symbols: list[Symbol]
+ _selection: NDArray[np.bool_] | None
def __init__(self, eqn, sympy_symbols, selection=None):
self._sympy = eqn
diff --git a/pysr/export_sympy.py b/pysr/export_sympy.py
index f38593413..ea54b01c6 100644
--- a/pysr/export_sympy.py
+++ b/pysr/export_sympy.py
@@ -1,6 +1,6 @@
"""Define utilities to export to sympy"""
-from typing import Callable, Dict, List, Optional
+from collections.abc import Callable
import sympy # type: ignore
from sympy import sympify
@@ -63,21 +63,21 @@
def create_sympy_symbols_map(
feature_names_in: ArrayLike[str],
-) -> Dict[str, sympy.Symbol]:
+) -> dict[str, sympy.Symbol]:
return {variable: sympy.Symbol(variable) for variable in feature_names_in}
def create_sympy_symbols(
feature_names_in: ArrayLike[str],
-) -> List[sympy.Symbol]:
+) -> list[sympy.Symbol]:
return [sympy.Symbol(variable) for variable in feature_names_in]
def pysr2sympy(
- equation: str,
+ equation: str | float | int,
*,
- feature_names_in: Optional[ArrayLike[str]] = None,
- extra_sympy_mappings: Optional[Dict[str, Callable]] = None,
+ feature_names_in: ArrayLike[str] | None = None,
+ extra_sympy_mappings: dict[str, Callable] | None = None,
):
if feature_names_in is None:
feature_names_in = []
diff --git a/pysr/expression_specs.py b/pysr/expression_specs.py
new file mode 100644
index 000000000..e86064785
--- /dev/null
+++ b/pysr/expression_specs.py
@@ -0,0 +1,266 @@
+import copy
+from abc import ABC, abstractmethod
+from typing import TYPE_CHECKING, Any, NewType, TypeAlias
+
+import numpy as np
+import pandas as pd
+
+from .export import add_export_formats
+from .julia_helpers import jl_array
+from .julia_import import AnyValue, SymbolicRegression, jl
+
+# For type checking purposes
+if TYPE_CHECKING:
+ from .sr import PySRRegressor # pragma: no cover
+
+ PySRRegressor: TypeAlias = PySRRegressor # pragma: no cover
+else:
+ PySRRegressor = NewType("PySRRegressor", Any)
+
+
+class AbstractExpressionSpec(ABC):
+ """Abstract base class describing expression types.
+
+    This basically just holds the options for the expression type,
+    as well as explaining how to parse and evaluate them.
+
+ All expression types must implement:
+
+ 1. julia_expression_type(): The actual expression type, returned as a Julia object.
+ This will get stored as `expression_type` in `SymbolicRegression.Options`.
+ 2. julia_expression_options(): Method to create the expression options, returned as a Julia object.
+ These will get stored as `expression_options` in `SymbolicRegression.Options`.
+ 3. create_exports(), which will be used to create the exports of the equations, such as
+ the executable format, the SymPy format, etc.
+
+ It may also optionally implement:
+
+ - supports_sympy, supports_torch, supports_jax, supports_latex: Whether this expression type supports the corresponding export format.
+ """
+
+ @abstractmethod
+ def julia_expression_type(self) -> AnyValue:
+ """The expression type"""
+ pass # pragma: no cover
+
+ @abstractmethod
+ def julia_expression_options(self) -> AnyValue:
+ """The expression options"""
+ pass # pragma: no cover
+
+ @abstractmethod
+ def create_exports(
+ self,
+ model: PySRRegressor,
+ equations: pd.DataFrame,
+ search_output,
+ ) -> pd.DataFrame:
+ """Create additional columns in the equations dataframe."""
+ pass # pragma: no cover
+
+ @property
+ def evaluates_in_julia(self) -> bool:
+ return False
+
+ @property
+ def supports_sympy(self) -> bool:
+ return False
+
+ @property
+ def supports_torch(self) -> bool:
+ return False
+
+ @property
+ def supports_jax(self) -> bool:
+ return False
+
+ @property
+ def supports_latex(self) -> bool:
+ return False
+
+
+class ExpressionSpec(AbstractExpressionSpec):
+ """Options for the regular Expression expression type"""
+
+ def julia_expression_type(self):
+ return SymbolicRegression.Expression
+
+ def julia_expression_options(self):
+ return jl.NamedTuple()
+
+ def create_exports(
+ self,
+ model: PySRRegressor,
+ equations: pd.DataFrame,
+ search_output,
+ ):
+ return add_export_formats(
+ equations,
+ feature_names_in=model.feature_names_in_,
+ selection_mask=model.selection_mask_,
+ extra_sympy_mappings=model.extra_sympy_mappings,
+ extra_torch_mappings=model.extra_torch_mappings,
+ output_jax_format=model.output_jax_format,
+ extra_jax_mappings=model.extra_jax_mappings,
+ output_torch_format=model.output_torch_format,
+ )
+
+ @property
+ def supports_sympy(self):
+ return True
+
+ @property
+ def supports_torch(self):
+ return True
+
+ @property
+ def supports_jax(self):
+ return True
+
+ @property
+ def supports_latex(self):
+ return True
+
+
+class TemplateExpressionSpec(AbstractExpressionSpec):
+ """The structure of a template expression.
+
+ This class allows you to specify how multiple sub-expressions should be combined
+ in a structured way, with constraints on which variables each sub-expression can use.
+ Pass this to PySRRegressor with the `expression_options` argument when you are using
+    Pass this to PySRRegressor with the `expression_spec` argument when you are using
+
+ Parameters
+ ----------
+ function_symbols : list[str]
+ List of symbols representing the inner expressions (e.g., ["f", "g"]).
+ These will be used as keys in the template structure.
+ combine : str
+ Julia function string that defines how the sub-expressions are combined.
+ Takes a NamedTuple of expressions and a tuple of data vectors.
+ For example: "((; f, g), (x1, x2, x3)) -> f(x1, x2) + g(x3)^2"
+        would constrain f to use x1, x2 and g to use x3.
+    num_features : dict[str, int], optional
+ Dictionary mapping function symbols to the number of features each can use.
+ For example: {"f": 2, "g": 1} means f takes 2 inputs and g takes 1.
+ If not provided, will be inferred from the combine function.
+
+ Examples
+ --------
+ ```python
+ # Create template that combines f(x1, x2) and g(x3):
+    template_spec = TemplateExpressionSpec(
+ function_symbols=["f", "g"],
+ combine="((; f, g), (x1, x2, x3)) -> sin(f(x1, x2)) + g(x3)^2",
+ )
+
+ # Use in PySRRegressor:
+ model = PySRRegressor(
+        expression_spec=template_spec,
+ )
+ ```
+ """
+
+ def __init__(
+ self,
+ function_symbols: list[str],
+ combine: str,
+ num_features: dict[str, int] | None = None,
+ ):
+ self.function_symbols = function_symbols
+ self.combine = combine
+ self.num_features = num_features
+
+ def julia_expression_type(self):
+ return SymbolicRegression.TemplateExpression
+
+ def julia_expression_options(self):
+ f_combine = jl.seval(self.combine)
+ creator = jl.seval(
+ """
+ function _pysr_create_template_structure(
+ @nospecialize(function_symbols::AbstractVector),
+ @nospecialize(combine::Function),
+ @nospecialize(num_features::Union{Nothing,AbstractDict})
+ )
+ tuple_symbol = (map(Symbol, function_symbols)..., )
+ num_features = if num_features === nothing
+ nothing
+ else
+ (; num_features...)
+ end
+ structure = SymbolicRegression.TemplateStructure{tuple_symbol}(combine, num_features)
+ return (; structure)
+ end
+ """
+ )
+ return creator(self.function_symbols, f_combine, self.num_features)
+
+ @property
+ def evaluates_in_julia(self):
+ return True
+
+ def create_exports(
+ self,
+ model: PySRRegressor,
+ equations: pd.DataFrame,
+ search_output,
+ ) -> pd.DataFrame:
+ # We try to load the raw julia state from a saved binary stream
+ # if not provided.
+ search_output = search_output or model.julia_state_
+ return _search_output_to_callable_expressions(equations, search_output)
+
+
+class ParametricExpressionSpec(AbstractExpressionSpec):
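+    """Spec for expressions that share a structure but have per-category parameters.
+
+    Pass this as the `expression_spec` argument of `PySRRegressor`, with
+    `max_parameters` bounding how many tunable parameters each expression may
+    use, and pass the per-sample `category` array to `fit` and `predict`.
+    """
+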
+ def __init__(self, max_parameters: int):
+ self.max_parameters = max_parameters
+
+ def julia_expression_type(self):
+ return SymbolicRegression.ParametricExpression
+
+ def julia_expression_options(self):
+ return jl.seval("NamedTuple{(:max_parameters,)}")((self.max_parameters,))
+
+ @property
+ def evaluates_in_julia(self):
+ return True
+
+ def create_exports(
+ self,
+ model: PySRRegressor,
+ equations: pd.DataFrame,
+ search_output,
+ ):
+ search_output = search_output or model.julia_state_
+ return _search_output_to_callable_expressions(equations, search_output)
+
+
+class CallableJuliaExpression:
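+    """Wrap a Julia expression so it is callable on numpy arrays.
+
+    Inputs are transposed to the (features, samples) layout that the Julia
+    backend expects, and the output is transposed back.
+    """
+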
+ def __init__(self, expression):
+ self.expression = expression
+
+ def __call__(self, X: np.ndarray, *args):
+ raw_output = self.expression(jl_array(X.T), *args)
+ return np.array(raw_output).T
+
+
+def _search_output_to_callable_expressions(
+ equations: pd.DataFrame, search_output
+) -> pd.DataFrame:
+ equations = copy.deepcopy(equations)
+ (_, out_hof) = search_output
+ expressions = []
+ callables = []
+
+ for _, row in equations.iterrows():
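+        # The hall of fame stores (at most) one member per complexity level,
+        # so this row's expression lives at index (complexity - 1).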
+        complexity = row["complexity"]
+        expression = out_hof.members[complexity - 1].tree
+ expressions.append(expression)
+ callables.append(CallableJuliaExpression(expression))
+
+ df = pd.DataFrame(
+ {"julia_expression": expressions, "lambda_format": callables},
+ index=equations.index,
+ )
+ return df
diff --git a/pysr/feature_selection.py b/pysr/feature_selection.py
index 7702e255a..8c8358fdd 100644
--- a/pysr/feature_selection.py
+++ b/pysr/feature_selection.py
@@ -1,6 +1,6 @@
"""Functions for doing feature selection during preprocessing."""
-from typing import Optional, cast
+from typing import cast
import numpy as np
from numpy import ndarray
@@ -13,7 +13,7 @@ def run_feature_selection(
X: ndarray,
y: ndarray,
select_k_features: int,
- random_state: Optional[np.random.RandomState] = None,
+ random_state: np.random.RandomState | None = None,
) -> NDArray[np.bool_]:
"""
Find most important features.
@@ -38,7 +38,7 @@ def run_feature_selection(
# Function has not been removed only due to usage in module tests
def _handle_feature_selection(
X: ndarray,
- select_k_features: Optional[int],
+ select_k_features: int | None,
y: ndarray,
variable_names: ArrayLike[str],
):
diff --git a/pysr/julia_extensions.py b/pysr/julia_extensions.py
index ac4714d48..72273f3e6 100644
--- a/pysr/julia_extensions.py
+++ b/pysr/julia_extensions.py
@@ -1,31 +1,39 @@
"""This file installs and loads extensions for SymbolicRegression."""
-from typing import Optional
+from typing import Literal
from .julia_import import Pkg, jl
+from .logger_specs import AbstractLoggerSpec, TensorBoardLoggerSpec
def load_required_packages(
*,
turbo: bool = False,
bumper: bool = False,
- enable_autodiff: bool = False,
- cluster_manager: Optional[str] = None,
+ autodiff_backend: Literal["Zygote"] | None = None,
+ cluster_manager: str | None = None,
+ logger_spec: AbstractLoggerSpec | None = None,
):
if turbo:
load_package("LoopVectorization", "bdcacae8-1622-11e9-2a5c-532679323890")
if bumper:
load_package("Bumper", "8ce10254-0962-460f-a3d8-1f77fea1446e")
- if enable_autodiff:
+ if autodiff_backend is not None:
load_package("Zygote", "e88e6eb3-aa80-5325-afca-941959d7151f")
if cluster_manager is not None:
load_package("ClusterManagers", "34f1f09b-3a8b-5176-ab39-66d58a4d544e")
+ if isinstance(logger_spec, TensorBoardLoggerSpec):
+ load_package("TensorBoardLogger", "899adc3e-224a-11e9-021f-63837185c80f")
def load_all_packages():
"""Install and load all Julia extensions available to PySR."""
load_required_packages(
- turbo=True, bumper=True, enable_autodiff=True, cluster_manager="slurm"
+ turbo=True,
+ bumper=True,
+ autodiff_backend="Zygote",
+ cluster_manager="slurm",
+ logger_spec=TensorBoardLoggerSpec(log_dir="logs"),
)
diff --git a/pysr/julia_helpers.py b/pysr/julia_helpers.py
index 18d4a6cf3..ef82be902 100644
--- a/pysr/julia_helpers.py
+++ b/pysr/julia_helpers.py
@@ -1,13 +1,13 @@
"""Functions for initializing the Julia environment and installing deps."""
-from typing import Any, Callable, Union, cast
+from typing import Any, Callable, cast, overload
import numpy as np
from juliacall import convert as jl_convert # type: ignore
from numpy.typing import NDArray
from .deprecated import init_julia, install
-from .julia_import import jl
+from .julia_import import AnyValue, jl
jl_convert = cast(Callable[[Any, Any], Any], jl_convert)
@@ -22,6 +22,8 @@
def _escape_filename(filename):
"""Turn a path into a string with correctly escaped backslashes."""
+ if filename is None:
+ return None
str_repr = str(filename)
str_repr = str_repr.replace("\\", "\\\\")
return str_repr
@@ -41,6 +43,10 @@ def jl_array(x, dtype=None):
return jl_convert(jl.Array[dtype], x)
+def jl_dict(x):
+ return jl_convert(jl.Dict, x)
+
+
def jl_is_function(f) -> bool:
return cast(bool, jl.seval("op -> op isa Function")(f))
@@ -51,7 +57,11 @@ def jl_serialize(obj: Any) -> NDArray[np.uint8]:
return np.array(jl.take_b(buf))
-def jl_deserialize(s: Union[NDArray[np.uint8], None]):
+@overload
+def jl_deserialize(s: NDArray[np.uint8]) -> AnyValue: ...
+@overload
+def jl_deserialize(s: None) -> None: ...
+def jl_deserialize(s):
if s is None:
return s
buf = jl.IOBuffer()
diff --git a/pysr/julia_import.py b/pysr/julia_import.py
index 0e032bee1..4d7b91504 100644
--- a/pysr/julia_import.py
+++ b/pysr/julia_import.py
@@ -42,6 +42,8 @@
# Deprecated; so just pass to juliacall
os.environ["PYTHON_JULIACALL_AUTOLOAD_IPYTHON_EXTENSION"] = autoload_extensions
+from juliacall import AnyValue # type: ignore
+from juliacall import VectorValue # type: ignore
from juliacall import Main as jl # type: ignore
jl = cast(ModuleType, jl)
diff --git a/pysr/juliapkg.json b/pysr/juliapkg.json
index 6b6e8aceb..35405a8ed 100644
--- a/pysr/juliapkg.json
+++ b/pysr/juliapkg.json
@@ -1,9 +1,9 @@
{
- "julia": "~1.6.7, ~1.7, ~1.8, ~1.9, =1.10.0, ~1.10.3",
+ "julia": "=1.10.0, 1.10.3",
"packages": {
"SymbolicRegression": {
"uuid": "8254be44-1295-4e6a-a16d-46603ac705cb",
- "version": "=0.24.5"
+ "version": "=1.0.3"
},
"Serialization": {
"uuid": "9e88b42a-f829-5b0c-bbe9-9e923198166b",
diff --git a/pysr/logger_specs.py b/pysr/logger_specs.py
new file mode 100644
index 000000000..885412405
--- /dev/null
+++ b/pysr/logger_specs.py
@@ -0,0 +1,76 @@
+from abc import ABC, abstractmethod
+from dataclasses import dataclass
+from typing import Any
+
+from .julia_helpers import jl_array, jl_dict
+from .julia_import import AnyValue, jl
+
+
+class AbstractLoggerSpec(ABC):
+ """Abstract base class for logger specifications."""
+
+ @abstractmethod
+ def create_logger(self) -> AnyValue:
+ """Create a logger instance."""
+ pass # pragma: no cover
+
+ @abstractmethod
+ def write_hparams(self, logger: AnyValue, hparams: dict[str, Any]) -> None:
+ """Write hyperparameters to the logger."""
+ pass # pragma: no cover
+
+
+@dataclass
+class TensorBoardLoggerSpec(AbstractLoggerSpec):
+ """Specification for TensorBoard logger.
+
+    Attributes
+    ----------
+ log_dir : str
+ Directory where TensorBoard logs will be saved. If `overwrite` is `False`,
+ new logs will be saved to `{log_dir}_1`, and so on. Default is `"logs/run"`.
+ log_interval : int, optional
+        Interval (in steps) at which logs are written. Default is 1.
+ overwrite : bool, optional
+ Whether to overwrite existing logs in the directory. Default is False.
+ """
+
+ log_dir: str = "logs/run"
+ log_interval: int = 1
+ overwrite: bool = False
+
+ def create_logger(self) -> AnyValue:
+ # We assume that TensorBoardLogger is already imported via `julia_extensions.py`
+ make_logger = jl.seval(
+ """
+ function make_logger(log_dir::AbstractString, overwrite::Bool, log_interval::Int)
+ base_logger = TensorBoardLogger.TBLogger(
+ log_dir,
+ (overwrite ? (TensorBoardLogger.tb_overwrite,) : ())...
+ )
+ return SRLogger(; logger=base_logger, log_interval)
+ end
+ """
+ )
+ log_dir = str(self.log_dir)
+ return make_logger(log_dir, self.overwrite, self.log_interval)
+
+ def write_hparams(self, logger: AnyValue, hparams: dict[str, Any]) -> None:
+ base_logger = jl.SymbolicRegression.get_logger(logger)
+ writer = jl.seval("TensorBoardLogger.write_hparams!")
+ jl_clean_hparams = jl_dict(
+ {
+ k: (v if isinstance(v, (bool, int, float)) else str(v))
+ for k, v in hparams.items()
+ }
+ )
+ writer(
+ base_logger,
+ jl_clean_hparams,
+ jl_array(
+ [
+ "search/data/summaries/pareto_volume",
+ "search/data/summaries/min_loss",
+ ],
+ ),
+ )
diff --git a/pysr/param_groupings.yml b/pysr/param_groupings.yml
index 0ff9d63da..f43c67f94 100644
--- a/pysr/param_groupings.yml
+++ b/pysr/param_groupings.yml
@@ -2,6 +2,7 @@
- Creating the Search Space:
- binary_operators
- unary_operators
+ - expression_spec
- maxsize
- maxdepth
- Setting the Search Size:
@@ -22,6 +23,7 @@
- complexity_of_operators
- complexity_of_constants
- complexity_of_variables
+ - complexity_mapping
- warmup_maxsize_by
- use_frequency
- use_frequency_in_tournament
@@ -35,6 +37,7 @@
- weight_mutate_constant
- weight_mutate_operator
- weight_swap_operands
+ - weight_rotate_tree
- weight_randomize
- weight_simplify
- weight_optimize
@@ -42,6 +45,7 @@
- annealing
- alpha
- perturbation_factor
+ - probability_negate_constant
- skip_mutation_failures
- Tournament Selection:
- tournament_selection_n
@@ -49,6 +53,7 @@
- Constant Optimization:
- optimizer_algorithm
- optimizer_nrestarts
+ - optimizer_f_calls_limit
- optimize_probability
- optimizer_iterations
- should_optimize_constants
@@ -66,8 +71,8 @@
- timeout_in_seconds
- early_stop_condition
- Performance and Parallelization:
+ - parallelism
- procs
- - multithreading
- cluster_manager
- heap_size_hint_in_bytes
- batching
@@ -76,7 +81,7 @@
- fast_cycle
- turbo
- bumper
- - enable_autodiff
+ - autodiff_backend
- Determinism:
- random_state
- deterministic
@@ -86,13 +91,15 @@
- update_verbosity
- print_precision
- progress
+ - logger_spec
- Environment:
- temp_equation_file
- tempdir
- delete_tempfiles
- update
- Exporting the Results:
- - equation_file
+ - output_directory
+ - run_id
- output_jax_format
- output_torch_format
- extra_sympy_mappings
diff --git a/pysr/sr.py b/pysr/sr.py
index 0054ce502..aa9492e2a 100644
--- a/pysr/sr.py
+++ b/pysr/sr.py
@@ -4,16 +4,15 @@
import os
import pickle as pkl
import re
-import shutil
import sys
import tempfile
import warnings
+from collections.abc import Callable
from dataclasses import dataclass, fields
-from datetime import datetime
from io import StringIO
from multiprocessing import cpu_count
from pathlib import Path
-from typing import Any, Callable, Dict, List, Literal, Optional, Tuple, Union, cast
+from typing import Any, Literal, cast
import numpy as np
import pandas as pd
@@ -26,20 +25,21 @@
from .denoising import denoise, multi_denoise
from .deprecated import DEPRECATED_KWARGS
-from .export_jax import sympy2jax
from .export_latex import (
sympy2latex,
sympy2latextable,
sympy2multilatextable,
with_preamble,
)
-from .export_numpy import sympy2numpy
-from .export_sympy import assert_valid_sympy_symbol, create_sympy_symbols, pysr2sympy
-from .export_torch import sympy2torch
+from .export_sympy import assert_valid_sympy_symbol
+from .expression_specs import (
+ AbstractExpressionSpec,
+ ExpressionSpec,
+ ParametricExpressionSpec,
+)
from .feature_selection import run_feature_selection
from .julia_extensions import load_required_packages
from .julia_helpers import (
- PythonCall,
_escape_filename,
_load_cluster_manager,
jl_array,
@@ -47,11 +47,11 @@
jl_is_function,
jl_serialize,
)
-from .julia_import import SymbolicRegression, jl
+from .julia_import import AnyValue, SymbolicRegression, VectorValue, jl
+from .logger_specs import AbstractLoggerSpec
from .utils import (
ArrayLike,
PathLike,
- _csv_filename_to_pkl_filename,
_preprocess_julia_floats,
_safe_check_feature_names_in,
_subscriptify,
@@ -61,7 +61,11 @@
ALREADY_RAN = False
-def _process_constraints(binary_operators, unary_operators, constraints):
+def _process_constraints(
+ binary_operators: list[str],
+ unary_operators: list,
+ constraints: dict[str, int | tuple[int, int]],
+) -> dict[str, int | tuple[int, int]]:
constraints = constraints.copy()
for op in unary_operators:
if op not in constraints:
@@ -76,30 +80,31 @@ def _process_constraints(binary_operators, unary_operators, constraints):
"One typical constraint is to use `constraints={..., '^': (-1, 1)}`, which "
"will allow arbitrary-complexity base (-1) but only powers such as "
"a constant or variable (1). "
- "For more tips, please see https://astroautomata.com/PySR/tuning/"
+ "For more tips, please see https://ai.damtp.cam.ac.uk/pysr/tuning/"
)
constraints[op] = (-1, -1)
+
+ constraint_tuple = cast(tuple[int, int], constraints[op])
if op in ["plus", "sub", "+", "-"]:
- if constraints[op][0] != constraints[op][1]:
+ if constraint_tuple[0] != constraint_tuple[1]:
raise NotImplementedError(
"You need equal constraints on both sides for - and +, "
"due to simplification strategies."
)
elif op in ["mult", "*"]:
# Make sure the complex expression is in the left side.
- if constraints[op][0] == -1:
+ if constraint_tuple[0] == -1:
continue
- if constraints[op][1] == -1 or constraints[op][0] < constraints[op][1]:
- constraints[op][0], constraints[op][1] = (
- constraints[op][1],
- constraints[op][0],
- )
+ if constraint_tuple[1] == -1 or constraint_tuple[0] < constraint_tuple[1]:
+ constraints[op] = (constraint_tuple[1], constraint_tuple[0])
return constraints
def _maybe_create_inline_operators(
- binary_operators, unary_operators, extra_sympy_mappings
-):
+ binary_operators: list[str],
+ unary_operators: list[str],
+ extra_sympy_mappings: dict[str, Callable] | None,
+) -> tuple[list[str], list[str]]:
binary_operators = binary_operators.copy()
unary_operators = unary_operators.copy()
for op_list in [binary_operators, unary_operators]:
@@ -190,6 +195,25 @@ def _check_assertions(
)
+def _validate_export_mappings(extra_jax_mappings, extra_torch_mappings):
+ # It is expected extra_jax/torch_mappings will be updated after fit.
+ # Thus, validation is performed here instead of in _validate_init_params
+ if extra_jax_mappings is not None:
+ for value in extra_jax_mappings.values():
+ if not isinstance(value, str):
+ raise ValueError(
+                    "extra_jax_mappings must have values that are strings! "
+ "e.g., {sympy.sqrt: 'jnp.sqrt'}."
+ )
+ if extra_torch_mappings is not None:
+ for value in extra_torch_mappings.values():
+ if not callable(value):
+ raise ValueError(
+                "extra_torch_mappings must have values that are callable functions! "
+ "e.g., {sympy.sqrt: torch.sqrt}."
+ )
+
+
# Class validation constants
VALID_OPTIMIZER_ALGORITHMS = ["BFGS", "NelderMead"]
@@ -198,11 +222,10 @@ def _check_assertions(
class _DynamicallySetParams:
"""Defines some parameters that are set at runtime."""
- binary_operators: List[str]
- unary_operators: List[str]
+ binary_operators: list[str]
+ unary_operators: list[str]
maxdepth: int
- constraints: Dict[str, str]
- multithreading: bool
+ constraints: dict[str, int | tuple[int, int]]
batch_size: int
update_verbosity: int
progress: bool
@@ -221,7 +244,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
Most default parameters have been tuned over several example equations,
but you should adjust `niterations`, `binary_operators`, `unary_operators`
to your requirements. You can view more detailed explanations of the options
- on the [options page](https://astroautomata.com/PySR/options) of the
+ on the [options page](https://ai.damtp.cam.ac.uk/pysr/options) of the
documentation.
Parameters
@@ -241,29 +264,35 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
most accurate model.
binary_operators : list[str]
List of strings for binary operators used in the search.
- See the [operators page](https://astroautomata.com/PySR/operators/)
+ See the [operators page](https://ai.damtp.cam.ac.uk/pysr/operators/)
for more details.
Default is `["+", "-", "*", "/"]`.
unary_operators : list[str]
Operators which only take a single scalar as input.
For example, `"cos"` or `"exp"`.
Default is `None`.
+ expression_spec : AbstractExpressionSpec
+ The type of expression to search for. By default,
+ this is just `ExpressionSpec()`. You can also use
+ `TemplateExpressionSpec(...)` which allows you to specify
+ a custom template for the expressions.
+ Default is `ExpressionSpec()`.
niterations : int
Number of iterations of the algorithm to run. The best
equations are printed and migrate between populations at the
end of each iteration.
- Default is `40`.
+ Default is `100`.
populations : int
Number of populations running.
- Default is `15`.
+ Default is `31`.
population_size : int
Number of individuals in each population.
- Default is `33`.
+ Default is `27`.
max_evals : int
Limits the total number of evaluations of expressions to
this number. Default is `None`.
maxsize : int
- Max complexity of an equation. Default is `20`.
+ Max complexity of an equation. Default is `30`.
maxdepth : int
Max depth of an equation. You can use both `maxsize` and
`maxdepth`. `maxdepth` is by default not used.
@@ -340,7 +369,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
`idx` argument to the function, which is `nothing`
for non-batched, and a 1D array of indices for batched.
Default is `None`.
- complexity_of_operators : dict[str, Union[int, float]]
+ complexity_of_operators : dict[str, int | float]
If you would like to use a complexity other than 1 for an
operator, specify the complexity here. For example,
`{"sin": 2, "+": 1}` would give a complexity of 2 for each use
@@ -351,14 +380,20 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
Default is `None`.
complexity_of_constants : int | float
Complexity of constants. Default is `1`.
- complexity_of_variables : int | float
+ complexity_of_variables : int | float | list[int | float]
Global complexity of variables. To set different complexities for
different variables, pass a list of complexities to the `fit` method
with keyword `complexity_of_variables`. You cannot use both.
Default is `1`.
+ complexity_mapping : str
+ Alternatively, you can pass a function (a string of Julia code) that
+ takes the expression as input and returns the complexity. Make sure that
+ this operates on `AbstractExpression` (and unpacks to `AbstractExpressionNode`),
+ and returns an integer.
+ Default is `None`.
parsimony : float
Multiplicative factor for how much to punish complexity.
- Default is `0.0032`.
+ Default is `0.0`.
dimensional_constraint_penalty : float
Additive penalty for if dimensional analysis of an expression fails.
By default, this is `1000.0`.
@@ -380,11 +415,11 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
weight the contribution. If you find that the search is only optimizing
the most complex expressions while the simpler expressions remain stagnant,
you should increase this value.
- Default is `20.0`.
+ Default is `1040.0`.
alpha : float
Initial temperature for simulated annealing
(requires `annealing` to be `True`).
- Default is `0.1`.
+ Default is `3.17`.
annealing : bool
Whether to use annealing. Default is `False`.
early_stop_condition : float | str
@@ -396,43 +431,46 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
ncycles_per_iteration : int
Number of total mutations to run, per 10 samples of the
population, per iteration.
- Default is `550`.
+ Default is `380`.
fraction_replaced : float
How much of population to replace with migrating equations from
other populations.
- Default is `0.000364`.
+ Default is `0.00036`.
fraction_replaced_hof : float
How much of population to replace with migrating equations from
- hall of fame. Default is `0.035`.
+ hall of fame. Default is `0.0614`.
weight_add_node : float
Relative likelihood for mutation to add a node.
- Default is `0.79`.
+ Default is `2.47`.
weight_insert_node : float
Relative likelihood for mutation to insert a node.
- Default is `5.1`.
+ Default is `0.0112`.
weight_delete_node : float
Relative likelihood for mutation to delete a node.
- Default is `1.7`.
+ Default is `0.870`.
weight_do_nothing : float
Relative likelihood for mutation to leave the individual.
- Default is `0.21`.
+ Default is `0.273`.
weight_mutate_constant : float
Relative likelihood for mutation to change the constant slightly
in a random direction.
- Default is `0.048`.
+ Default is `0.0346`.
weight_mutate_operator : float
Relative likelihood for mutation to swap an operator.
- Default is `0.47`.
+ Default is `0.293`.
weight_swap_operands : float
        Relative likelihood for swapping operands in binary operators.
- Default is `0.1`.
+ Default is `0.198`.
+ weight_rotate_tree : float
+ How often to perform a tree rotation at a random node.
+ Default is `4.26`.
weight_randomize : float
Relative likelihood for mutation to completely delete and then
        randomly generate the equation.
- Default is `0.00023`.
+ Default is `0.000502`.
weight_simplify : float
        Relative likelihood for mutation to simplify constant parts by evaluation.
- Default is `0.0020`.
+ Default is `0.00209`.
weight_optimize: float
Constant optimization can also be performed as a mutation, in addition to
the normal strategy controlled by `optimize_probability` which happens
@@ -441,7 +479,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
Default is `0.0`.
crossover_probability : float
Absolute probability of crossover-type genetic operation, instead of a mutation.
- Default is `0.066`.
+ Default is `0.0259`.
skip_mutation_failures : bool
Whether to skip mutation and crossover failures, rather than
simply re-sampling the current member.
@@ -467,6 +505,9 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
Number of time to restart the constants optimization process with
different initial conditions.
Default is `2`.
+ optimizer_f_calls_limit : int
+ How many function calls to allow during optimization.
+ Default is `10_000`.
optimize_probability : float
Probability of optimizing the constants during a single iteration of
the evolutionary algorithm.
@@ -478,21 +519,24 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
Constants are perturbed by a max factor of
(perturbation_factor*T + 1). Either multiplied by this or
divided by this.
- Default is `0.076`.
+ Default is `0.129`.
+ probability_negate_constant : float
+ Probability of negating a constant in the equation when mutating it.
+ Default is `0.00743`.
tournament_selection_n : int
Number of expressions to consider in each tournament.
- Default is `10`.
+ Default is `15`.
tournament_selection_p : float
Probability of selecting the best expression in each
tournament. The probability will decay as p*(1-p)^n for other
expressions, sorted by loss.
- Default is `0.86`.
- procs : int
- Number of processes (=number of populations running).
- Default is `cpu_count()`.
- multithreading : bool
- Use multithreading instead of distributed backend.
- Using procs=0 will turn off both. Default is `True`.
+ Default is `0.982`.
+    parallelism : Literal["serial", "multithreading", "multiprocessing"] | None
+        Parallelism to use for the search. Can be `"serial"`, `"multithreading"`, or `"multiprocessing"`.
+        Default is `"multithreading"`.
+    procs : int | None
+        Number of processes to use for parallelism. If `None`, defaults to `cpu_count()`.
+        Default is `None`.
cluster_manager : str
For distributed computing, this sets the job queue system. Set
to one of "slurm", "pbs", "lsf", "sge", "qrsh", "scyld", or
@@ -532,11 +576,11 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
If you pass complex data, the corresponding complex precision
will be used (i.e., `64` for complex128, `32` for complex64).
Default is `32`.
- enable_autodiff : bool
- Whether to create derivative versions of operators for automatic
- differentiation. This is only necessary if you wish to compute
- the gradients of an expression within a custom loss function.
- Default is `False`.
+ autodiff_backend : Literal["Zygote"] | None
+ Which backend to use for automatic differentiation during constant
+ optimization. Currently only `"Zygote"` is supported. The default,
+ `None`, uses forward-mode or finite difference.
+ Default is `None`.
random_state : int, Numpy RandomState instance or None
Pass an int for reproducible results across multiple function calls.
        See :term:`Glossary <random_state>`.
@@ -544,7 +588,7 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
deterministic : bool
Make a PySR search give the same result every run.
To use this, you must turn off parallelism
- (with `procs`=0, `multithreading`=False),
+ (with `parallelism="serial"`),
and set `random_state` to a fixed seed.
Default is `False`.
warm_start : bool
@@ -563,8 +607,18 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
progress : bool
Whether to use a progress bar instead of printing to stdout.
Default is `True`.
- equation_file : str
- Where to save the files (.csv extension).
+    logger_spec : AbstractLoggerSpec | None
+ Logger specification for the Julia backend. See, for example,
+ `TensorBoardLoggerSpec`.
+ Default is `None`.
+ run_id : str
+ A unique identifier for the run. Will be generated using the
+ current date and time if not provided.
+ Default is `None`.
+ output_directory : str
+ The base directory to save output files to. Files
+ will be saved in a subdirectory according to the run ID.
+ Will be set to `outputs/` if not provided.
Default is `None`.
temp_equation_file : bool
Whether to put the hall of fame file in the temp directory.
@@ -642,15 +696,16 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
Number of output dimensions.
selection_mask_ : ndarray of shape (`n_features_in_`,)
Mask of which features of `X` to use when `select_k_features` is set.
- tempdir_ : Path
+ tempdir_ : Path | None
Path to the temporary equations directory.
- equation_file_ : Union[str, Path]
- Output equation file name produced by the julia backend.
julia_state_stream_ : ndarray
The serialized state for the julia SymbolicRegression.jl backend (after fitting),
stored as an array of uint8, produced by Julia's Serialization.serialize function.
julia_options_stream_ : ndarray
        The serialized julia options, stored as an array of uint8,
        produced by Julia's Serialization.serialize function.
+ expression_spec_ : AbstractExpressionSpec
+ The expression specification used for this fit. This is equal to
+ `self.expression_spec` if provided, or `ExpressionSpec()` otherwise.
equation_file_contents_ : list[pandas.DataFrame]
Contents of the equation file output by the Julia backend.
show_pickle_warnings_ : bool
@@ -697,111 +752,120 @@ class PySRRegressor(MultiOutputMixin, RegressorMixin, BaseEstimator):
```
"""
- equations_: Union[pd.DataFrame, List[pd.DataFrame], None]
+ equations_: pd.DataFrame | list[pd.DataFrame] | None
n_features_in_: int
feature_names_in_: ArrayLike[str]
display_feature_names_in_: ArrayLike[str]
- complexity_of_variables_: Union[int, float, List[Union[int, float]], None]
- X_units_: Union[ArrayLike[str], None]
- y_units_: Union[str, ArrayLike[str], None]
+ complexity_of_variables_: int | float | list[int | float] | None
+ X_units_: ArrayLike[str] | None
+ y_units_: str | ArrayLike[str] | None
nout_: int
- selection_mask_: Union[NDArray[np.bool_], None]
- tempdir_: Path
- equation_file_: PathLike
- julia_state_stream_: Union[NDArray[np.uint8], None]
- julia_options_stream_: Union[NDArray[np.uint8], None]
- equation_file_contents_: Union[List[pd.DataFrame], None]
+ selection_mask_: NDArray[np.bool_] | None
+ run_id_: str
+ output_directory_: str
+ julia_state_stream_: NDArray[np.uint8] | None
+ julia_options_stream_: NDArray[np.uint8] | None
+ equation_file_contents_: list[pd.DataFrame] | None
show_pickle_warnings_: bool
def __init__(
self,
model_selection: Literal["best", "accuracy", "score"] = "best",
*,
- binary_operators: Optional[List[str]] = None,
- unary_operators: Optional[List[str]] = None,
- niterations: int = 40,
- populations: int = 15,
- population_size: int = 33,
- max_evals: Optional[int] = None,
- maxsize: int = 20,
- maxdepth: Optional[int] = None,
- warmup_maxsize_by: Optional[float] = None,
- timeout_in_seconds: Optional[float] = None,
- constraints: Optional[Dict[str, Union[int, Tuple[int, int]]]] = None,
- nested_constraints: Optional[Dict[str, Dict[str, int]]] = None,
- elementwise_loss: Optional[str] = None,
- loss_function: Optional[str] = None,
- complexity_of_operators: Optional[Dict[str, Union[int, float]]] = None,
- complexity_of_constants: Union[int, float] = 1,
- complexity_of_variables: Optional[Union[int, float]] = None,
- parsimony: float = 0.0032,
- dimensional_constraint_penalty: Optional[float] = None,
+ binary_operators: list[str] | None = None,
+ unary_operators: list[str] | None = None,
+ expression_spec: AbstractExpressionSpec | None = None,
+ niterations: int = 100,
+ populations: int = 31,
+ population_size: int = 27,
+ max_evals: int | None = None,
+ maxsize: int = 30,
+ maxdepth: int | None = None,
+ warmup_maxsize_by: float | None = None,
+ timeout_in_seconds: float | None = None,
+ constraints: dict[str, int | tuple[int, int]] | None = None,
+ nested_constraints: dict[str, dict[str, int]] | None = None,
+ elementwise_loss: str | None = None,
+ loss_function: str | None = None,
+ complexity_of_operators: dict[str, int | float] | None = None,
+ complexity_of_constants: int | float | None = None,
+ complexity_of_variables: int | float | list[int | float] | None = None,
+ complexity_mapping: str | None = None,
+ parsimony: float = 0.0,
+ dimensional_constraint_penalty: float | None = None,
dimensionless_constants_only: bool = False,
use_frequency: bool = True,
use_frequency_in_tournament: bool = True,
- adaptive_parsimony_scaling: float = 20.0,
- alpha: float = 0.1,
+ adaptive_parsimony_scaling: float = 1040.0,
+ alpha: float = 3.17,
annealing: bool = False,
- early_stop_condition: Optional[Union[float, str]] = None,
- ncycles_per_iteration: int = 550,
- fraction_replaced: float = 0.000364,
- fraction_replaced_hof: float = 0.035,
- weight_add_node: float = 0.79,
- weight_insert_node: float = 5.1,
- weight_delete_node: float = 1.7,
- weight_do_nothing: float = 0.21,
- weight_mutate_constant: float = 0.048,
- weight_mutate_operator: float = 0.47,
- weight_swap_operands: float = 0.1,
- weight_randomize: float = 0.00023,
- weight_simplify: float = 0.0020,
+ early_stop_condition: float | str | None = None,
+ ncycles_per_iteration: int = 380,
+ fraction_replaced: float = 0.00036,
+ fraction_replaced_hof: float = 0.0614,
+ weight_add_node: float = 2.47,
+ weight_insert_node: float = 0.0112,
+ weight_delete_node: float = 0.870,
+ weight_do_nothing: float = 0.273,
+ weight_mutate_constant: float = 0.0346,
+ weight_mutate_operator: float = 0.293,
+ weight_swap_operands: float = 0.198,
+ weight_rotate_tree: float = 4.26,
+ weight_randomize: float = 0.000502,
+ weight_simplify: float = 0.00209,
weight_optimize: float = 0.0,
- crossover_probability: float = 0.066,
+ crossover_probability: float = 0.0259,
skip_mutation_failures: bool = True,
migration: bool = True,
hof_migration: bool = True,
topn: int = 12,
- should_simplify: Optional[bool] = None,
+ should_simplify: bool = True,
should_optimize_constants: bool = True,
optimizer_algorithm: Literal["BFGS", "NelderMead"] = "BFGS",
optimizer_nrestarts: int = 2,
+ optimizer_f_calls_limit: int | None = None,
optimize_probability: float = 0.14,
optimizer_iterations: int = 8,
- perturbation_factor: float = 0.076,
- tournament_selection_n: int = 10,
- tournament_selection_p: float = 0.86,
- procs: int = cpu_count(),
- multithreading: Optional[bool] = None,
- cluster_manager: Optional[
- Literal["slurm", "pbs", "lsf", "sge", "qrsh", "scyld", "htc"]
- ] = None,
- heap_size_hint_in_bytes: Optional[int] = None,
+ perturbation_factor: float = 0.129,
+ probability_negate_constant: float = 0.00743,
+ tournament_selection_n: int = 15,
+ tournament_selection_p: float = 0.982,
+ parallelism: (
+ Literal["serial", "multithreading", "multiprocessing"] | None
+ ) = None,
+ procs: int | None = None,
+ cluster_manager: (
+ Literal["slurm", "pbs", "lsf", "sge", "qrsh", "scyld", "htc"] | None
+ ) = None,
+ heap_size_hint_in_bytes: int | None = None,
batching: bool = False,
batch_size: int = 50,
fast_cycle: bool = False,
turbo: bool = False,
bumper: bool = False,
- precision: int = 32,
- enable_autodiff: bool = False,
- random_state=None,
+ precision: Literal[16, 32, 64] = 32,
+ autodiff_backend: Literal["Zygote"] | None = None,
+ random_state: int | np.random.RandomState | None = None,
deterministic: bool = False,
warm_start: bool = False,
verbosity: int = 1,
- update_verbosity: Optional[int] = None,
+ update_verbosity: int | None = None,
print_precision: int = 5,
progress: bool = True,
- equation_file: Optional[str] = None,
+ logger_spec: AbstractLoggerSpec | None = None,
+ run_id: str | None = None,
+ output_directory: str | None = None,
temp_equation_file: bool = False,
- tempdir: Optional[str] = None,
+ tempdir: str | None = None,
delete_tempfiles: bool = True,
update: bool = False,
output_jax_format: bool = False,
output_torch_format: bool = False,
- extra_sympy_mappings: Optional[Dict[str, Callable]] = None,
- extra_torch_mappings: Optional[Dict[Callable, Callable]] = None,
- extra_jax_mappings: Optional[Dict[Callable, str]] = None,
+ extra_sympy_mappings: dict[str, Callable] | None = None,
+ extra_torch_mappings: dict[Callable, Callable] | None = None,
+ extra_jax_mappings: dict[Callable, str] | None = None,
denoise: bool = False,
- select_k_features: Optional[int] = None,
+ select_k_features: int | None = None,
**kwargs,
):
# Hyperparameters
@@ -809,6 +873,7 @@ def __init__(
self.model_selection = model_selection
self.binary_operators = binary_operators
self.unary_operators = unary_operators
+ self.expression_spec = expression_spec
self.niterations = niterations
self.populations = populations
self.population_size = population_size
@@ -830,6 +895,7 @@ def __init__(
self.complexity_of_operators = complexity_of_operators
self.complexity_of_constants = complexity_of_constants
self.complexity_of_variables = complexity_of_variables
+ self.complexity_mapping = complexity_mapping
self.parsimony = parsimony
self.dimensional_constraint_penalty = dimensional_constraint_penalty
self.dimensionless_constants_only = dimensionless_constants_only
@@ -847,6 +913,7 @@ def __init__(
self.weight_mutate_constant = weight_mutate_constant
self.weight_mutate_operator = weight_mutate_operator
self.weight_swap_operands = weight_swap_operands
+ self.weight_rotate_tree = weight_rotate_tree
self.weight_randomize = weight_randomize
self.weight_simplify = weight_simplify
self.weight_optimize = weight_optimize
@@ -862,15 +929,17 @@ def __init__(
self.should_optimize_constants = should_optimize_constants
self.optimizer_algorithm = optimizer_algorithm
self.optimizer_nrestarts = optimizer_nrestarts
+ self.optimizer_f_calls_limit = optimizer_f_calls_limit
self.optimize_probability = optimize_probability
self.optimizer_iterations = optimizer_iterations
self.perturbation_factor = perturbation_factor
+ self.probability_negate_constant = probability_negate_constant
# -- Selection parameters
self.tournament_selection_n = tournament_selection_n
self.tournament_selection_p = tournament_selection_p
# -- Performance parameters
+ self.parallelism = parallelism
self.procs = procs
- self.multithreading = multithreading
self.cluster_manager = cluster_manager
self.heap_size_hint_in_bytes = heap_size_hint_in_bytes
self.batching = batching
@@ -879,7 +948,7 @@ def __init__(
self.turbo = turbo
self.bumper = bumper
self.precision = precision
- self.enable_autodiff = enable_autodiff
+ self.autodiff_backend = autodiff_backend
self.random_state = random_state
self.deterministic = deterministic
self.warm_start = warm_start
@@ -889,8 +958,10 @@ def __init__(
self.update_verbosity = update_verbosity
self.print_precision = print_precision
self.progress = progress
+ self.logger_spec = logger_spec
# - Project management
- self.equation_file = equation_file
+ self.run_id = run_id
+ self.output_directory = output_directory
self.temp_equation_file = temp_equation_file
self.tempdir = tempdir
self.delete_tempfiles = delete_tempfiles
@@ -917,6 +988,9 @@ def __init__(
"Please use that instead.",
FutureWarning,
)
+ elif k == "multithreading":
+ # Specific advice given in `_map_parallelism_params`
+ self.multithreading: bool | None = v
# Handle kwargs that have been moved to the fit method
elif k in ["weights", "variable_names", "Xresampled"]:
warnings.warn(
@@ -927,7 +1001,7 @@ def __init__(
elif k == "julia_project":
warnings.warn(
"The `julia_project` parameter has been deprecated. To use a custom "
- "julia project, please see `https://astroautomata.com/PySR/backend`.",
+ "julia project, please see `https://ai.damtp.cam.ac.uk/pysr/backend`.",
FutureWarning,
)
elif k == "julia_kwargs":
@@ -949,24 +1023,26 @@ def __init__(
@classmethod
def from_file(
cls,
- equation_file: PathLike,
+ equation_file: None = None, # Deprecated
*,
- binary_operators: Optional[List[str]] = None,
- unary_operators: Optional[List[str]] = None,
- n_features_in: Optional[int] = None,
- feature_names_in: Optional[ArrayLike[str]] = None,
- selection_mask: Optional[NDArray[np.bool_]] = None,
+ run_directory: PathLike,
+ binary_operators: list[str] | None = None,
+ unary_operators: list[str] | None = None,
+ n_features_in: int | None = None,
+ feature_names_in: ArrayLike[str] | None = None,
+ selection_mask: NDArray[np.bool_] | None = None,
nout: int = 1,
**pysr_kwargs,
- ):
+ ) -> "PySRRegressor":
"""
Create a model from a saved model checkpoint or equation file.
Parameters
----------
- equation_file : str or Path
- Path to a pickle file containing a saved model, or a csv file
- containing equations.
+ run_directory : str
+ The directory containing outputs from a previous run.
+ This is of the form `[output_directory]/[run_id]`.
binary_operators : list[str]
The same binary operators used when creating the model.
Not needed if loading from a pickle file.
@@ -996,70 +1072,74 @@ def from_file(
model : PySRRegressor
The model with fitted equations.
"""
+ if equation_file is not None:
+ raise ValueError(
+ "Passing `equation_file` is deprecated and no longer compatible with "
+ "the most recent versions of PySR's backend. Please pass `run_directory` "
+ "instead, which contains all checkpoint files."
+ )
- pkl_filename = _csv_filename_to_pkl_filename(equation_file)
-
- # Try to load model from .pkl
- print(f"Checking if {pkl_filename} exists...")
- if os.path.exists(pkl_filename):
- print(f"Loading model from {pkl_filename}")
+ pkl_filename = Path(run_directory) / "checkpoint.pkl"
+ if pkl_filename.exists():
+ print(f"Attempting to load model from {pkl_filename}...")
assert binary_operators is None
assert unary_operators is None
assert n_features_in is None
with open(pkl_filename, "rb") as f:
- model = pkl.load(f)
- # Change equation_file_ to be in the same dir as the pickle file
- base_dir = os.path.dirname(pkl_filename)
- base_equation_file = os.path.basename(model.equation_file_)
- model.equation_file_ = os.path.join(base_dir, base_equation_file)
+ model = cast("PySRRegressor", pkl.load(f))
# Update any parameters if necessary, such as
# extra_sympy_mappings:
model.set_params(**pysr_kwargs)
+
if "equations_" not in model.__dict__ or model.equations_ is None:
model.refresh()
return model
-
- # Else, we re-create it.
- print(
- f"{pkl_filename} does not exist, "
- "so we must create the model from scratch."
- )
- assert binary_operators is not None or unary_operators is not None
- assert n_features_in is not None
-
- # TODO: copy .bkup file if exists.
- model = cls(
- equation_file=str(equation_file),
- binary_operators=binary_operators,
- unary_operators=unary_operators,
- **pysr_kwargs,
- )
-
- model.nout_ = nout
- model.n_features_in_ = n_features_in
-
- if feature_names_in is None:
- model.feature_names_in_ = np.array([f"x{i}" for i in range(n_features_in)])
- model.display_feature_names_in_ = np.array(
- [f"x{_subscriptify(i)}" for i in range(n_features_in)]
- )
else:
- assert len(feature_names_in) == n_features_in
- model.feature_names_in_ = feature_names_in
- model.display_feature_names_in_ = feature_names_in
+ print(
+ f"Checkpoint file {pkl_filename} does not exist. "
+ "Attempting to recreate model from scratch..."
+ )
+ csv_filename = Path(run_directory) / "hall_of_fame.csv"
+ csv_filename_bak = Path(run_directory) / "hall_of_fame.csv.bak"
+ if not csv_filename.exists() and not csv_filename_bak.exists():
+ raise FileNotFoundError(
+ f"Hall of fame file `{csv_filename}` or `{csv_filename_bak}` does not exist. "
+ "Please pass a `run_directory` containing a valid checkpoint file."
+ )
+ assert binary_operators is not None or unary_operators is not None
+ assert n_features_in is not None
+ model = cls(
+ binary_operators=binary_operators,
+ unary_operators=unary_operators,
+ **pysr_kwargs,
+ )
+ model.nout_ = nout
+ model.n_features_in_ = n_features_in
- if selection_mask is None:
- model.selection_mask_ = np.ones(n_features_in, dtype=np.bool_)
- else:
- model.selection_mask_ = selection_mask
+ if feature_names_in is None:
+ model.feature_names_in_ = np.array(
+ [f"x{i}" for i in range(n_features_in)]
+ )
+ model.display_feature_names_in_ = np.array(
+ [f"x{_subscriptify(i)}" for i in range(n_features_in)]
+ )
+ else:
+ assert len(feature_names_in) == n_features_in
+ model.feature_names_in_ = feature_names_in
+ model.display_feature_names_in_ = feature_names_in
- model.refresh(checkpoint_file=equation_file)
+ if selection_mask is None:
+ model.selection_mask_ = np.ones(n_features_in, dtype=np.bool_)
+ else:
+ model.selection_mask_ = selection_mask
- return model
+ model.refresh(run_directory=run_directory)
- def __repr__(self):
+ return model
+
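
A sketch of the new loading pattern (the run id shown is hypothetical; it is whatever subdirectory the earlier fit created under `output_directory`):

```python
# Prefers <run_directory>/checkpoint.pkl; falls back to rebuilding from
# hall_of_fame.csv, in which case operators and n_features_in are required.
model = PySRRegressor.from_file(
    run_directory="outputs/20240115_120000_ab12cd",
)
```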
+ def __repr__(self) -> str:
"""
Print all current equations fitted by the model.
@@ -1106,7 +1186,7 @@ def __repr__(self):
output += "]"
return output
- def __getstate__(self):
+ def __getstate__(self) -> dict[str, Any]:
"""
Handle pickle serialization for PySRRegressor.
@@ -1163,10 +1243,18 @@ def _checkpoint(self):
"""
# Save model state:
self.show_pickle_warnings_ = False
- with open(_csv_filename_to_pkl_filename(self.equation_file_), "wb") as f:
- pkl.dump(self, f)
+ with open(self.get_pkl_filename(), "wb") as f:
+ try:
+ pkl.dump(self, f)
+ except Exception as e:
+ print(f"Error checkpointing model: {e}")
self.show_pickle_warnings_ = True
+ def get_pkl_filename(self) -> Path:
+ path = Path(self.output_directory_) / self.run_id_ / "checkpoint.pkl"
+ path.parent.mkdir(parents=True, exist_ok=True)
+ return path
+
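
Putting `get_pkl_filename` together with `get_equation_file` (defined later in this diff), the run-directory layout looks like the following sketch, assuming the default `output_directory`:

```python
from pathlib import Path

run_dir = Path("outputs") / "my_run"
checkpoint = run_dir / "checkpoint.pkl"           # written by _checkpoint()
hof = run_dir / "hall_of_fame.csv"                # single-output search
hof_backup = run_dir / "hall_of_fame.csv.bak"     # backend backup, read first
hof_multi = run_dir / "hall_of_fame_output1.csv"  # multi-output, 1-indexed
```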
@property
def equations(self): # pragma: no cover
warnings.warn(
@@ -1184,7 +1272,10 @@ def julia_options_(self):
@property
def julia_state_(self):
"""The deserialized state."""
- return jl_deserialize(self.julia_state_stream_)
+ return cast(
+ tuple[VectorValue, AnyValue] | None,
+ jl_deserialize(self.julia_state_stream_),
+ )
@property
def raw_julia_state_(self):
@@ -1196,7 +1287,13 @@ def raw_julia_state_(self):
)
return self.julia_state_
- def get_best(self, index=None) -> Union[pd.Series, List[pd.Series]]:
+ @property
+ def expression_spec_(self):
+ return self.expression_spec or ExpressionSpec()
+
+ def get_best(
+ self, index: int | list[int] | None = None
+ ) -> pd.Series | list[pd.Series]:
"""
Get best equation using `model_selection`.
@@ -1242,27 +1339,44 @@ def get_best(self, index=None) -> Union[pd.Series, List[pd.Series]]:
equations_.loc[idx_model_selection(equations_, self.model_selection)],
)
- def _setup_equation_file(self):
- """
- Set the full pathname of the equation file.
+ @property
+ def equation_file_(self):
+ raise NotImplementedError(
+ "PySRRegressor.equation_file_ is now deprecated. "
+ "Please use PySRRegressor.output_directory_ and PySRRegressor.run_id_ "
+ "instead. For loading, you should pass `run_directory`."
+ )
- This is performed using `tempdir` and
- `equation_file`.
- """
- # Cast tempdir string as a Path object
- self.tempdir_ = Path(tempfile.mkdtemp(dir=self.tempdir))
- if self.temp_equation_file:
- self.equation_file_ = self.tempdir_ / "hall_of_fame.csv"
- elif self.equation_file is None:
- if self.warm_start and (
- hasattr(self, "equation_file_") and self.equation_file_
- ):
- pass
- else:
- date_time = datetime.now().strftime("%Y-%m-%d_%H%M%S.%f")[:-3]
- self.equation_file_ = "hall_of_fame_" + date_time + ".csv"
+ def _setup_equation_file(self):
+ """Set the pathname of the output directory."""
+ if self.warm_start and (
+ hasattr(self, "run_id_") or hasattr(self, "output_directory_")
+ ):
+ assert hasattr(self, "output_directory_")
+ assert hasattr(self, "run_id_")
+ if self.run_id is not None:
+ assert self.run_id_ == self.run_id
+ if self.output_directory is not None:
+ assert self.output_directory_ == self.output_directory
else:
- self.equation_file_ = self.equation_file
+ self.output_directory_ = (
+ tempfile.mkdtemp()
+ if self.temp_equation_file
+ else (
+ "outputs"
+ if self.output_directory is None
+ else self.output_directory
+ )
+ )
+ self.run_id_ = (
+ cast(str, SymbolicRegression.SearchUtilsModule.generate_run_id())
+ if self.run_id is None
+ else self.run_id
+ )
+ if self.temp_equation_file:
+ assert self.output_directory is None
+
+ def _clear_equation_file_contents(self):
self.equation_file_contents_ = None
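
Ignoring the warm-start branch, the directory resolution above reduces to a small pure function; a runnable sketch:

```python
import tempfile


def resolve_output_dir(temp_equation_file: bool, output_directory: str | None) -> str:
    """Sketch of _setup_equation_file's directory choice (warm start omitted)."""
    if temp_equation_file:
        return tempfile.mkdtemp()
    return "outputs" if output_directory is None else output_directory


assert resolve_output_dir(False, None) == "outputs"
assert resolve_output_dir(False, "my_runs") == "my_runs"
```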
def _validate_and_modify_params(self) -> _DynamicallySetParams:
@@ -1294,24 +1408,6 @@ def _validate_and_modify_params(self) -> _DynamicallySetParams:
elif self.maxsize < 7:
raise ValueError("PySR requires a maxsize of at least 7")
- if self.deterministic and not (
- self.multithreading in [False, None]
- and self.procs == 0
- and self.random_state is not None
- ):
- raise ValueError(
- "To ensure deterministic searches, you must set `random_state` to a seed, "
- "`procs` to `0`, and `multithreading` to `False` or `None`."
- )
-
- if self.random_state is not None and (
- not self.deterministic or self.procs != 0
- ):
- warnings.warn(
- "Note: Setting `random_state` without also setting `deterministic` "
- "to True and `procs` to 0 will result in non-deterministic searches. "
- )
-
if self.elementwise_loss is not None and self.loss_function is not None:
raise ValueError(
"You cannot set both `elementwise_loss` and `loss_function`."
@@ -1328,7 +1424,6 @@ def _validate_and_modify_params(self) -> _DynamicallySetParams:
unary_operators=[],
maxdepth=self.maxsize,
constraints={},
- multithreading=self.procs != 0 and self.cluster_manager is None,
batch_size=1,
update_verbosity=int(self.verbosity),
progress=self.progress,
@@ -1364,15 +1459,15 @@ def _validate_and_set_fit_params(
complexity_of_variables,
X_units,
y_units,
- ) -> Tuple[
+ ) -> tuple[
ndarray,
ndarray,
- Optional[ndarray],
- Optional[ndarray],
+ ndarray | None,
+ ndarray | None,
ArrayLike[str],
- Union[int, float, List[Union[int, float]]],
- Optional[ArrayLike[str]],
- Optional[Union[str, ArrayLike[str]]],
+ int | float | list[int | float] | None,
+ ArrayLike[str] | None,
+ str | ArrayLike[str] | None,
]:
"""
        Validate the parameters passed to the :term:`fit` method.
@@ -1458,7 +1553,7 @@ def _validate_and_set_fit_params(
elif self.complexity_of_variables is not None:
complexity_of_variables = self.complexity_of_variables
else:
- complexity_of_variables = 1
+ complexity_of_variables = None
# Data validation and feature name fetching via sklearn
# This method sets the n_features_in_ attribute
@@ -1505,23 +1600,31 @@ def _validate_and_set_fit_params(
y_units,
)
- def _validate_data_X_y(self, X, y) -> Tuple[ndarray, ndarray]:
+ def _validate_data_X_y(self, X: Any, y: Any) -> tuple[ndarray, ndarray]:
raw_out = self._validate_data(X=X, y=y, reset=True, multi_output=True) # type: ignore
- return cast(Tuple[ndarray, ndarray], raw_out)
+ return cast(tuple[ndarray, ndarray], raw_out)
- def _validate_data_X(self, X) -> Tuple[ndarray]:
+ def _validate_data_X(self, X: Any) -> ndarray:
raw_out = self._validate_data(X=X, reset=False) # type: ignore
- return cast(Tuple[ndarray], raw_out)
+ return cast(ndarray, raw_out)
+
+ def _get_precision_mapped_dtype(self, X: np.ndarray) -> type:
+ is_complex = np.issubdtype(X.dtype, np.complexfloating)
+ is_real = not is_complex
+ if is_real:
+ return {16: np.float16, 32: np.float32, 64: np.float64}[self.precision]
+ else:
+ return {32: np.complex64, 64: np.complex128}[self.precision]
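
The precision ladders implied by `_get_precision_mapped_dtype`, checked in a short sketch (note `precision=16` supports only real data, since there is no 16-bit complex dtype):

```python
import numpy as np

real_ladder = {16: np.float16, 32: np.float32, 64: np.float64}
complex_ladder = {32: np.complex64, 64: np.complex128}

X = np.array([[1.0 + 2.0j]])
precision = 64
ladder = complex_ladder if np.issubdtype(X.dtype, np.complexfloating) else real_ladder
assert ladder[precision] == np.complex128
```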
def _pre_transform_training_data(
self,
X: ndarray,
y: ndarray,
- Xresampled: Union[ndarray, None],
+ Xresampled: ndarray | None,
variable_names: ArrayLike[str],
- complexity_of_variables: Union[int, float, List[Union[int, float]]],
- X_units: Union[ArrayLike[str], None],
- y_units: Union[ArrayLike[str], str, None],
+ complexity_of_variables: int | float | list[int | float] | None,
+ X_units: ArrayLike[str] | None,
+ y_units: ArrayLike[str] | str | None,
random_state: np.random.RandomState,
):
"""
@@ -1542,7 +1645,7 @@ def _pre_transform_training_data(
variable_names : list[str]
Names of each variable in the training dataset, `X`.
Of length `n_features`.
- complexity_of_variables : int | float | list[int | float]
+ complexity_of_variables : int | float | list[int | float] | None
Complexity of each variable in the training dataset, `X`.
X_units : list[str]
Units of each variable in the training dataset, `X`.
@@ -1633,7 +1736,8 @@ def _run(
X: ndarray,
y: ndarray,
runtime_params: _DynamicallySetParams,
- weights: Optional[ndarray],
+ weights: ndarray | None,
+ category: ndarray | None,
seed: int,
):
"""
@@ -1652,6 +1756,10 @@ def _run(
Weight array of the same shape as `y`.
Each element is how to weight the mean-square-error loss
for that particular element of y.
+ category : ndarray | None
+ If `expression_spec` is a `ParametricExpressionSpec`, then this
+ argument should be a list of integers representing the category
+ of each sample in `X`.
seed : int
Random seed for julia backend process.
@@ -1673,13 +1781,7 @@ def _run(
# specified in init, so we define them here locally:
binary_operators = runtime_params.binary_operators
unary_operators = runtime_params.unary_operators
- maxdepth = runtime_params.maxdepth
constraints = runtime_params.constraints
- multithreading = runtime_params.multithreading
- batch_size = runtime_params.batch_size
- update_verbosity = runtime_params.update_verbosity
- progress = runtime_params.progress
- warmup_maxsize_by = runtime_params.warmup_maxsize_by
nested_constraints = self.nested_constraints
complexity_of_operators = self.complexity_of_operators
@@ -1687,10 +1789,31 @@ def _run(
cluster_manager = self.cluster_manager
# Start julia backend processes
- if not ALREADY_RAN and update_verbosity != 0:
+ if not ALREADY_RAN and runtime_params.update_verbosity != 0:
print("Compiling Julia backend...")
+ parallelism, numprocs = _map_parallelism_params(
+ self.parallelism, self.procs, getattr(self, "multithreading", None)
+ )
+
+ if self.deterministic and parallelism != "serial":
+ raise ValueError(
+ "To ensure deterministic searches, you must set `parallelism='serial'`. "
+ "Additionally, make sure to set `random_state` to a seed."
+ )
+ if self.random_state is not None and (
+ parallelism != "serial" or not self.deterministic
+ ):
+ warnings.warn(
+ "Note: Setting `random_state` without also setting `deterministic=True` "
+ "and `parallelism='serial'` will result in non-deterministic searches."
+ )
+
if cluster_manager is not None:
+ if parallelism != "multiprocessing":
+ raise ValueError(
+ "To use cluster managers, you must set `parallelism='multiprocessing'`."
+ )
cluster_manager = _load_cluster_manager(cluster_manager)
# TODO(mcranmer): These functions should be part of this class.
@@ -1699,14 +1822,17 @@ def _run(
unary_operators=unary_operators,
extra_sympy_mappings=self.extra_sympy_mappings,
)
- constraints = _process_constraints(
- binary_operators=binary_operators,
- unary_operators=unary_operators,
- constraints=constraints,
- )
-
- una_constraints = [constraints[op] for op in unary_operators]
- bin_constraints = [constraints[op] for op in binary_operators]
+ if constraints is not None:
+ _constraints = _process_constraints(
+ binary_operators=binary_operators,
+ unary_operators=unary_operators,
+ constraints=constraints,
+ )
+ una_constraints = [_constraints[op] for op in unary_operators]
+ bin_constraints = [_constraints[op] for op in binary_operators]
+ else:
+ una_constraints = None
+ bin_constraints = None
        # Parse dict into Julia Dict for nested constraints:
if nested_constraints is not None:
@@ -1749,14 +1875,21 @@ def _run(
load_required_packages(
turbo=self.turbo,
bumper=self.bumper,
- enable_autodiff=self.enable_autodiff,
+ autodiff_backend=self.autodiff_backend,
cluster_manager=cluster_manager,
+ logger_spec=self.logger_spec,
)
+ if self.autodiff_backend is not None:
+ autodiff_backend = jl.Symbol(self.autodiff_backend)
+ else:
+ autodiff_backend = None
+
mutation_weights = SymbolicRegression.MutationWeights(
mutate_constant=self.weight_mutate_constant,
mutate_operator=self.weight_mutate_operator,
swap_operands=self.weight_swap_operands,
+ rotate_tree=self.weight_rotate_tree,
add_node=self.weight_add_node,
insert_node=self.weight_insert_node,
delete_node=self.weight_delete_node,
@@ -1766,8 +1899,8 @@ def _run(
optimize=self.weight_optimize,
)
- jl_binary_operators: List[Any] = []
- jl_unary_operators: List[Any] = []
+ jl_binary_operators: list[Any] = []
+ jl_unary_operators: list[Any] = []
for input_list, output_list, name in [
(binary_operators, jl_binary_operators, "binary"),
(unary_operators, jl_unary_operators, "unary"),
@@ -1780,6 +1913,12 @@ def _run(
)
output_list.append(jl_op)
+ complexity_mapping = (
+ jl.seval(self.complexity_mapping) if self.complexity_mapping else None
+ )
+
+ logger = self.logger_spec.create_logger() if self.logger_spec else None
+
# Call to Julia backend.
# See https://github.com/MilesCranmer/SymbolicRegression.jl/blob/master/src/OptionsStruct.jl
options = SymbolicRegression.Options(
@@ -1790,14 +1929,19 @@ def _run(
complexity_of_operators=complexity_of_operators,
complexity_of_constants=self.complexity_of_constants,
complexity_of_variables=complexity_of_variables,
+ complexity_mapping=complexity_mapping,
+ expression_type=self.expression_spec_.julia_expression_type(),
+ expression_options=self.expression_spec_.julia_expression_options(),
nested_constraints=nested_constraints,
elementwise_loss=custom_loss,
loss_function=custom_full_objective,
maxsize=int(self.maxsize),
- output_file=_escape_filename(self.equation_file_),
+ output_directory=_escape_filename(self.output_directory_),
npopulations=int(self.populations),
batching=self.batching,
- batch_size=int(min([batch_size, len(X)]) if self.batching else len(X)),
+ batch_size=int(
+ min([runtime_params.batch_size, len(X)]) if self.batching else len(X)
+ ),
mutation_weights=mutation_weights,
tournament_selection_p=self.tournament_selection_p,
tournament_selection_n=self.tournament_selection_n,
@@ -1806,17 +1950,17 @@ def _run(
dimensional_constraint_penalty=self.dimensional_constraint_penalty,
dimensionless_constants_only=self.dimensionless_constants_only,
alpha=self.alpha,
- maxdepth=maxdepth,
+ maxdepth=runtime_params.maxdepth,
fast_cycle=self.fast_cycle,
turbo=self.turbo,
bumper=self.bumper,
- enable_autodiff=self.enable_autodiff,
+ autodiff_backend=autodiff_backend,
migration=self.migration,
hof_migration=self.hof_migration,
fraction_replaced_hof=self.fraction_replaced_hof,
should_simplify=self.should_simplify,
should_optimize_constants=self.should_optimize_constants,
- warmup_maxsize_by=warmup_maxsize_by,
+ warmup_maxsize_by=runtime_params.warmup_maxsize_by,
use_frequency=self.use_frequency,
use_frequency_in_tournament=self.use_frequency_in_tournament,
adaptive_parsimony_scaling=self.adaptive_parsimony_scaling,
@@ -1827,9 +1971,11 @@ def _run(
print_precision=self.print_precision,
optimizer_algorithm=self.optimizer_algorithm,
optimizer_nrestarts=self.optimizer_nrestarts,
+ optimizer_f_calls_limit=self.optimizer_f_calls_limit,
optimizer_probability=self.optimize_probability,
optimizer_iterations=self.optimizer_iterations,
perturbation_factor=self.perturbation_factor,
+ probability_negate_constant=self.probability_negate_constant,
annealing=self.annealing,
timeout_in_seconds=self.timeout_in_seconds,
crossover_probability=self.crossover_probability,
@@ -1845,12 +1991,7 @@ def _run(
# Convert data to desired precision
test_X = np.array(X)
- is_complex = np.issubdtype(test_X.dtype, np.complexfloating)
- is_real = not is_complex
- if is_real:
- np_dtype = {16: np.float16, 32: np.float32, 64: np.float64}[self.precision]
- else:
- np_dtype = {32: np.complex64, 64: np.complex128}[self.precision]
+ np_dtype = self._get_precision_mapped_dtype(test_X)
# This converts the data into a Julia array:
jl_X = jl_array(np.array(X, dtype=np_dtype).T)
@@ -1866,16 +2007,14 @@ def _run(
else:
jl_weights = None
- if self.procs == 0 and not multithreading:
- parallelism = "serial"
- elif multithreading:
- parallelism = "multithreading"
+ if category is not None:
+ offset_for_julia_indexing = 1
+ jl_category = jl_array(
+ (category + offset_for_julia_indexing).astype(np.int64)
+ )
+ jl_extra = jl.seval("NamedTuple{(:class,)}")((jl_category,))
else:
- parallelism = "multiprocessing"
-
- cprocs = (
- None if parallelism in ["serial", "multithreading"] else int(self.procs)
- )
+ jl_extra = jl.NamedTuple()
if len(y.shape) > 1:
# We set these manually so that they respect Python's 0 indexing
@@ -1886,11 +2025,11 @@ def _run(
else:
jl_y_variable_names = None
- PythonCall.GC.disable()
out = SymbolicRegression.equation_search(
jl_X,
jl_y,
weights=jl_weights,
+ extra=jl_extra,
niterations=int(self.niterations),
variable_names=jl_array([str(v) for v in self.feature_names_in_]),
display_variable_names=jl_array(
@@ -1904,24 +2043,26 @@ def _run(
else self.y_units_
),
options=options,
- numprocs=cprocs,
+ numprocs=numprocs,
parallelism=parallelism,
saved_state=self.julia_state_,
return_state=True,
+ run_id=self.run_id_,
addprocs_function=cluster_manager,
heap_size_hint_in_bytes=self.heap_size_hint_in_bytes,
- progress=progress and self.verbosity > 0 and len(y.shape) == 1,
+ progress=runtime_params.progress
+ and self.verbosity > 0
+ and len(y.shape) == 1,
verbosity=int(self.verbosity),
+ logger=logger,
)
- PythonCall.GC.enable()
+ if self.logger_spec is not None:
+ self.logger_spec.write_hparams(logger, self.get_params())
self.julia_state_stream_ = jl_serialize(out)
# Set attributes
- self.equations_ = self.get_hof()
-
- if self.delete_tempfiles:
- shutil.rmtree(self.tempdir_)
+ self.equations_ = self.get_hof(out)
ALREADY_RAN = True
@@ -1931,14 +2072,14 @@ def fit(
self,
X,
y,
+ *,
Xresampled=None,
weights=None,
- variable_names: Optional[ArrayLike[str]] = None,
- complexity_of_variables: Optional[
- Union[int, float, List[Union[int, float]]]
- ] = None,
- X_units: Optional[ArrayLike[str]] = None,
- y_units: Optional[Union[str, ArrayLike[str]]] = None,
+ variable_names: ArrayLike[str] | None = None,
+ complexity_of_variables: int | float | list[int | float] | None = None,
+ X_units: ArrayLike[str] | None = None,
+ y_units: str | ArrayLike[str] | None = None,
+ category: ndarray | None = None,
) -> "PySRRegressor":
"""
Search for equations to fit the dataset and store them in `self.equations_`.
@@ -1975,6 +2116,10 @@ def fit(
Similar to `X_units`, but as a unit for the target variable, `y`.
If `y` is a matrix, a list of units should be passed. If `X_units`
is given but `y_units` is not, then `y_units` will be arbitrary.
+        category : ndarray | None
+            If `expression_spec` is a `ParametricExpressionSpec`, then this
+            argument should be an array of integers representing the category
+            of each sample.
Returns
-------
@@ -2002,9 +2147,17 @@ def fit(
self.y_units_ = None
self._setup_equation_file()
+ self._clear_equation_file_contents()
runtime_params = self._validate_and_modify_params()
+ if category is not None:
+ assert Xresampled is None
+
+ if isinstance(self.expression_spec, ParametricExpressionSpec):
+ assert category is not None
+
+ # TODO: Put `category` here
(
X,
y,
@@ -2028,7 +2181,7 @@ def fit(
if X.shape[0] > 10000 and not self.batching:
warnings.warn(
"Note: you are running with more than 10,000 datapoints. "
- "You should consider turning on batching (https://astroautomata.com/PySR/options/#batching). "
+ "You should consider turning on batching (https://ai.damtp.cam.ac.uk/pysr/options/#batching). "
"You should also reconsider if you need that many datapoints. "
"Unless you have a large amount of noise (in which case you "
"should smooth your dataset first), generally < 10,000 datapoints "
@@ -2084,7 +2237,7 @@ def fit(
self._checkpoint()
# Perform the search:
- self._run(X, y, runtime_params, weights=weights, seed=seed)
+ self._run(X, y, runtime_params, weights=weights, seed=seed, category=category)
# Then, after fit, we save again, so the pickle file contains
# the equations:
@@ -2093,7 +2246,7 @@ def fit(
return self
- def refresh(self, checkpoint_file: Optional[PathLike] = None) -> None:
+ def refresh(self, run_directory: PathLike | None = None) -> None:
"""
Update self.equations_ with any new options passed.
@@ -2106,13 +2259,20 @@ def refresh(self, checkpoint_file: Optional[PathLike] = None) -> None:
            Path to the run directory, of the form `[output_directory]/[run_id]`,
            from which to load previous results.
            The default uses the run directory from the most recent fit.
"""
- if checkpoint_file is not None:
- self.equation_file_ = checkpoint_file
- self.equation_file_contents_ = None
- check_is_fitted(self, attributes=["equation_file_"])
+ if run_directory is not None:
+ self.output_directory_ = str(Path(run_directory).parent)
+ self.run_id_ = Path(run_directory).name
+ self._clear_equation_file_contents()
+ check_is_fitted(self, attributes=["run_id_", "output_directory_"])
self.equations_ = self.get_hof()
- def predict(self, X, index=None):
+ def predict(
+ self,
+ X,
+ index: int | list[int] | None = None,
+ *,
+ category: ndarray | None = None,
+ ) -> ndarray:
"""
Predict y from input X using the equation chosen by `model_selection`.
@@ -2128,6 +2288,10 @@ def predict(self, X, index=None):
particular row of `self.equations_`, you may specify the index here.
For multiple output equations, you must pass a list of indices
in the same order.
+ category : ndarray | None
+ If `expression_spec` is a `ParametricExpressionSpec`, then this
+ argument should be a list of integers representing the category
+ of each sample in `X`.
Returns
-------
@@ -2169,15 +2333,30 @@ def predict(self, X, index=None):
# feature selected) X in fit.
X = X.reindex(columns=self.feature_names_in_)
X = self._validate_data_X(X)
+ if self.expression_spec_.evaluates_in_julia:
+ # Julia wants the right dtype
+ X = X.astype(self._get_precision_mapped_dtype(X))
+
+ if category is not None:
+ offset_for_julia_indexing = 1
+ args: tuple = (
+ jl_array((category + offset_for_julia_indexing).astype(np.int64)),
+ )
+ else:
+ args = ()
try:
if isinstance(best_equation, list):
assert self.nout_ > 1
return np.stack(
- [eq["lambda_format"](X) for eq in best_equation], axis=1
+ [
+ cast(ndarray, eq["lambda_format"](X, *args))
+ for eq in best_equation
+ ],
+ axis=1,
)
else:
- return best_equation["lambda_format"](X)
+ return cast(ndarray, best_equation["lambda_format"](X, *args))
except Exception as error:
raise ValueError(
"Failed to evaluate the expression. "
@@ -2187,7 +2366,7 @@ def predict(self, X, index=None):
"You can then run `model.refresh()` to re-load the expressions."
) from error
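
A hedged usage sketch of the category-aware path above: Python callers keep 0-based labels, and the `+1` offset for Julia's 1-based indexing is applied internally:

```python
import numpy as np

# Assumes `model` was fit with expression_spec=ParametricExpressionSpec(...)
# and a matching `category` array during fit:
X_test = np.random.randn(5, 2)
category = np.array([0, 0, 1, 1, 2])  # 0-based labels; Julia sees 1..3
y_pred = model.predict(X_test, category=category)
```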
- def sympy(self, index=None):
+ def sympy(self, index: int | list[int] | None = None):
"""
Return sympy representation of the equation(s) chosen by `model_selection`.
@@ -2205,6 +2384,10 @@ def sympy(self, index=None):
best_equation : str, list[str] of length nout_
SymPy representation of the best equation.
"""
+ if not self.expression_spec_.supports_sympy:
+ raise ValueError(
+ f"`expression_spec={self.expression_spec_}` does not support sympy export."
+ )
self.refresh()
best_equation = self.get_best(index=index)
if isinstance(best_equation, list):
@@ -2213,7 +2396,9 @@ def sympy(self, index=None):
else:
return best_equation["sympy_format"]
- def latex(self, index=None, precision=3):
+ def latex(
+ self, index: int | list[int] | None = None, precision: int = 3
+ ) -> str | list[str]:
"""
Return latex representation of the equation(s) chosen by `model_selection`.
@@ -2235,6 +2420,10 @@ def latex(self, index=None, precision=3):
best_equation : str or list[str] of length nout_
LaTeX expression of the best equation.
"""
+ if not self.expression_spec_.supports_latex:
+ raise ValueError(
+ f"`expression_spec={self.expression_spec_}` does not support latex export."
+ )
self.refresh()
sympy_representation = self.sympy(index=index)
if self.nout_ > 1:
@@ -2268,6 +2457,10 @@ def jax(self, index=None):
Dictionary of callable jax function in "callable" key,
and jax array of parameters as "parameters" key.
"""
+ if not self.expression_spec_.supports_jax:
+ raise ValueError(
+ f"`expression_spec={self.expression_spec_}` does not support jax export."
+ )
self.set_params(output_jax_format=True)
self.refresh()
best_equation = self.get_best(index=index)
@@ -2300,6 +2493,10 @@ def pytorch(self, index=None):
best_equation : torch.nn.Module
PyTorch module representing the expression.
"""
+ if not self.expression_spec_.supports_torch:
+ raise ValueError(
+ f"`expression_spec={self.expression_spec_}` does not support torch export."
+ )
self.set_params(output_torch_format=True)
self.refresh()
best_equation = self.get_best(index=index)
@@ -2308,27 +2505,35 @@ def pytorch(self, index=None):
else:
return best_equation["torch_format"]
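
The same guard pattern now fronts `sympy`, `latex`, `jax`, and `pytorch` (and `latex_table` below): each export first checks the spec's `supports_*` flag. A sketch of the failure mode for a hypothetical spec without sympy support:

```python
# For an expression spec whose supports_sympy is False:
try:
    model.sympy()
except ValueError as e:
    print(e)  # "`expression_spec=...` does not support sympy export."
```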
- def _read_equation_file(self):
+ def get_equation_file(self, i: int | None = None) -> Path:
+ if i is not None:
+ return (
+ Path(self.output_directory_)
+ / self.run_id_
+ / f"hall_of_fame_output{i}.csv"
+ )
+ else:
+ return Path(self.output_directory_) / self.run_id_ / "hall_of_fame.csv"
+
+ def _read_equation_file(self) -> list[pd.DataFrame]:
"""Read the hall of fame file created by `SymbolicRegression.jl`."""
try:
if self.nout_ > 1:
all_outputs = []
for i in range(1, self.nout_ + 1):
- cur_filename = str(self.equation_file_) + f".out{i}" + ".bkup"
+ cur_filename = str(self.get_equation_file(i)) + ".bak"
if not os.path.exists(cur_filename):
- cur_filename = str(self.equation_file_) + f".out{i}"
+ cur_filename = str(self.get_equation_file(i))
with open(cur_filename, "r", encoding="utf-8") as f:
buf = f.read()
buf = _preprocess_julia_floats(buf)
-
df = self._postprocess_dataframe(pd.read_csv(StringIO(buf)))
-
all_outputs.append(df)
else:
- filename = str(self.equation_file_) + ".bkup"
+ filename = str(self.get_equation_file()) + ".bak"
if not os.path.exists(filename):
- filename = str(self.equation_file_)
+ filename = str(self.get_equation_file())
with open(filename, "r", encoding="utf-8") as f:
buf = f.read()
buf = _preprocess_julia_floats(buf)
@@ -2352,8 +2557,8 @@ def _postprocess_dataframe(self, df: pd.DataFrame) -> pd.DataFrame:
return df
- def get_hof(self):
- """Get the equations from a hall of fame file.
+ def get_hof(self, search_output=None) -> pd.DataFrame | list[pd.DataFrame]:
+ """Get the equations from a hall of fame file or search output.
    If no search output is given, the hall of fame file written
    by the most recent call to `fit` will be read.
@@ -2362,127 +2567,34 @@ def get_hof(self):
self,
attributes=[
"nout_",
- "equation_file_",
+ "run_id_",
+ "output_directory_",
"selection_mask_",
"feature_names_in_",
],
)
- if (
+ should_read_from_file = (
not hasattr(self, "equation_file_contents_")
- ) or self.equation_file_contents_ is None:
+ or self.equation_file_contents_ is None
+ )
+ if should_read_from_file:
self.equation_file_contents_ = self._read_equation_file()
- # It is expected extra_jax/torch_mappings will be updated after fit.
- # Thus, validation is performed here instead of in _validate_init_params
- extra_jax_mappings = self.extra_jax_mappings
- extra_torch_mappings = self.extra_torch_mappings
- if extra_jax_mappings is not None:
- for value in extra_jax_mappings.values():
- if not isinstance(value, str):
- raise ValueError(
- "extra_jax_mappings must have keys that are strings! "
- "e.g., {sympy.sqrt: 'jnp.sqrt'}."
- )
- else:
- extra_jax_mappings = {}
- if extra_torch_mappings is not None:
- for value in extra_torch_mappings.values():
- if not callable(value):
- raise ValueError(
- "extra_torch_mappings must be callable functions! "
- "e.g., {sympy.sqrt: torch.sqrt}."
- )
- else:
- extra_torch_mappings = {}
-
- ret_outputs = []
-
- equation_file_contents = copy.deepcopy(self.equation_file_contents_)
-
- for output in equation_file_contents:
- scores = []
- lastMSE = None
- lastComplexity = 0
- sympy_format = []
- lambda_format = []
- jax_format = []
- torch_format = []
-
- for _, eqn_row in output.iterrows():
- eqn = pysr2sympy(
- eqn_row["equation"],
- feature_names_in=self.feature_names_in_,
- extra_sympy_mappings=self.extra_sympy_mappings,
- )
- sympy_format.append(eqn)
-
- # NumPy:
- sympy_symbols = create_sympy_symbols(self.feature_names_in_)
- lambda_format.append(
- sympy2numpy(
- eqn,
- sympy_symbols,
- selection=self.selection_mask_,
- )
- )
+ _validate_export_mappings(self.extra_jax_mappings, self.extra_torch_mappings)
- # JAX:
- if self.output_jax_format:
- func, params = sympy2jax(
- eqn,
- sympy_symbols,
- selection=self.selection_mask_,
- extra_jax_mappings=self.extra_jax_mappings,
- )
- jax_format.append({"callable": func, "parameters": params})
-
- # Torch:
- if self.output_torch_format:
- module = sympy2torch(
- eqn,
- sympy_symbols,
- selection=self.selection_mask_,
- extra_torch_mappings=self.extra_torch_mappings,
- )
- torch_format.append(module)
-
- curMSE = eqn_row["loss"]
- curComplexity = eqn_row["complexity"]
-
- if lastMSE is None:
- cur_score = 0.0
- else:
- if curMSE > 0.0:
- # TODO Move this to more obvious function/file.
- cur_score = -np.log(curMSE / lastMSE) / (
- curComplexity - lastComplexity
- )
- else:
- cur_score = np.inf
-
- scores.append(cur_score)
- lastMSE = curMSE
- lastComplexity = curComplexity
-
- output["score"] = np.array(scores)
- output["sympy_format"] = sympy_format
- output["lambda_format"] = lambda_format
- output_cols = [
- "complexity",
- "loss",
- "score",
- "equation",
- "sympy_format",
- "lambda_format",
- ]
- if self.output_jax_format:
- output_cols += ["jax_format"]
- output["jax_format"] = jax_format
- if self.output_torch_format:
- output_cols += ["torch_format"]
- output["torch_format"] = torch_format
+ equation_file_contents = cast(list[pd.DataFrame], self.equation_file_contents_)
- ret_outputs.append(output[output_cols])
+ ret_outputs = [
+ pd.concat(
+ [
+ output,
+ calculate_scores(output),
+ self.expression_spec_.create_exports(self, output, search_output),
+ ],
+ axis=1,
+ )
+ for output in equation_file_contents
+ ]
if self.nout_ > 1:
return ret_outputs
@@ -2490,10 +2602,10 @@ def get_hof(self):
def latex_table(
self,
- indices=None,
- precision=3,
- columns=["equation", "complexity", "loss", "score"],
- ):
+ indices: list[int] | None = None,
+ precision: int = 3,
+ columns: list[str] = ["equation", "complexity", "loss", "score"],
+ ) -> str:
"""Create a LaTeX/booktabs table for all, or some, of the equations.
Parameters
@@ -2516,6 +2628,10 @@ def latex_table(
latex_table_str : str
A string that will render a table in LaTeX of the equations.
"""
+ if not self.expression_spec_.supports_latex:
+ raise ValueError(
+ f"`expression_spec={self.expression_spec_}` does not support latex export."
+ )
self.refresh()
if isinstance(self.equations_, list):
@@ -2561,12 +2677,41 @@ def idx_model_selection(equations: pd.DataFrame, model_selection: str):
return chosen_idx
-def _mutate_parameter(param_name: str, param_value):
- if param_name in ["binary_operators", "unary_operators"] and isinstance(
- param_value, str
- ):
- return [param_value]
+def calculate_scores(df: pd.DataFrame) -> pd.DataFrame:
+ """Calculate scores for each equation based on loss and complexity.
+
+ Score is defined as the negated derivative of the log-loss with respect to complexity.
+ A higher score means the equation achieved a much better loss at a slightly higher complexity.
+ """
+ scores = []
+ lastMSE = None
+ lastComplexity = 0
+ for _, row in df.iterrows():
+ curMSE = row["loss"]
+ curComplexity = row["complexity"]
+
+ if lastMSE is None:
+ cur_score = 0.0
+ else:
+ if curMSE > 0.0:
+ cur_score = -np.log(curMSE / lastMSE) / (curComplexity - lastComplexity)
+ else:
+ cur_score = np.inf
+
+ scores.append(cur_score)
+ lastMSE = curMSE
+ lastComplexity = curComplexity
+
+ return pd.DataFrame(
+ {
+ "score": np.array(scores),
+ },
+ index=df.index,
+ )
+
+
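
A worked example of the score definition: each row gets `-log(loss_i / loss_prev) / (complexity_i - complexity_prev)`, with the first row pinned to `0.0`:

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({"complexity": [1, 3, 5], "loss": [1.0, 0.5, 0.25]})
scores = calculate_scores(df)["score"]
# Loss halves for each +2 in complexity, so rows 1 and 2 both score
# -log(0.5) / 2 = log(2) / 2 ~= 0.347.
assert np.allclose(scores, [0.0, np.log(2) / 2, np.log(2) / 2])
```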
+def _mutate_parameter(param_name: str, param_value):
if param_name == "batch_size" and param_value < 1:
warnings.warn(
"Given `batch_size` must be greater than or equal to one. "
@@ -2586,3 +2731,88 @@ def _mutate_parameter(param_name: str, param_value):
return False
return param_value
+
+
+def _map_parallelism_params(
+ parallelism: Literal["serial", "multithreading", "multiprocessing"] | None,
+ procs: int | None,
+ multithreading: bool | None,
+) -> tuple[Literal["serial", "multithreading", "multiprocessing"], int | None]:
+ """Map old and new parallelism parameters to the new format.
+
+ Parameters
+ ----------
+ parallelism : str or None
+ New parallelism parameter. Can be "serial", "multithreading", or "multiprocessing".
+ procs : int or None
+ Number of processes parameter.
+ multithreading : bool or None
+ Old multithreading parameter.
+
+ Returns
+ -------
+ parallelism : str
+ Mapped parallelism mode.
+ procs : int or None
+ Mapped number of processes.
+
+ Raises
+ ------
+ ValueError
+ If both old and new parameters are specified, or if invalid combinations are given.
+ """
+ # Check for mixing old and new parameters
+ using_new = parallelism is not None
+ using_old = multithreading is not None
+
+ if using_new and using_old:
+ raise ValueError(
+ "Cannot mix old and new parallelism parameters. "
+ "Use either `parallelism` and `numprocs`, or `procs` and `multithreading`."
+ )
+ elif using_old:
+ warnings.warn(
+ "The `multithreading: bool` parameter has been deprecated in favor "
+ "of `parallelism: Literal['multithreading', 'serial', 'multiprocessing']`.\n"
+ "Previous usage of `multithreading=True` (default) is now `parallelism='multithreading'`; "
+ "`multithreading=False, procs=0` is now `parallelism='serial'`; and "
+ "`multithreading=True, procs={int}` is now `parallelism='multiprocessing', procs={int}`."
+ )
+ if multithreading:
+ _parallelism: Literal["multithreading", "multiprocessing", "serial"] = (
+ "multithreading"
+ )
+ _procs = None
+ elif procs is not None and procs > 0:
+ _parallelism = "multiprocessing"
+ _procs = procs
+ else:
+ _parallelism = "serial"
+ _procs = None
+ elif using_new:
+ _parallelism = cast(
+ Literal["serial", "multithreading", "multiprocessing"], parallelism
+ )
+ _procs = procs
+ else:
+ _parallelism = "multithreading"
+ _procs = None
+
+ if _parallelism not in {"serial", "multithreading", "multiprocessing"}:
+ raise ValueError(
+ "`parallelism` must be one of 'serial', 'multithreading', or 'multiprocessing'"
+ )
+ elif _parallelism == "serial" and _procs is not None:
+ warnings.warn(
+ "`numprocs` is specified but will be ignored since `parallelism='serial'`"
+ )
+ _procs = None
+ elif parallelism == "multithreading" and _procs is not None:
+ warnings.warn(
+ "`numprocs` is specified but will be ignored since `parallelism='multithreading'`"
+ )
+ _procs = None
+ elif parallelism == "multiprocessing" and _procs is None:
+ _procs = cpu_count()
+
+ return _parallelism, _procs
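(Behavioral sketch of the mapping above; the `pysr.sr` import path is an assumption:)

    import warnings

    from pysr.sr import _map_parallelism_params

    # New-style arguments pass through unchanged:
    assert _map_parallelism_params("serial", None, None) == ("serial", None)
    assert _map_parallelism_params("multiprocessing", 4, None) == ("multiprocessing", 4)

    # Old-style arguments are translated, emitting a deprecation warning:
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        assert _map_parallelism_params(None, None, True) == ("multithreading", None)
        assert _map_parallelism_params(None, 4, False) == ("multiprocessing", 4)
        assert _map_parallelism_params(None, 0, False) == ("serial", None)
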
diff --git a/pysr/test/__init__.py b/pysr/test/__init__.py
index cb6b9e4a3..4d977cccf 100644
--- a/pysr/test/__init__.py
+++ b/pysr/test/__init__.py
@@ -1,7 +1,7 @@
-from .test import runtests
from .test_cli import get_runtests as get_runtests_cli
from .test_dev import runtests as runtests_dev
from .test_jax import runtests as runtests_jax
+from .test_main import runtests
from .test_startup import runtests as runtests_startup
from .test_torch import runtests as runtests_torch
diff --git a/pysr/test/params.py b/pysr/test/params.py
index 54da4ac7d..10de46c99 100644
--- a/pysr/test/params.py
+++ b/pysr/test/params.py
@@ -1,4 +1,6 @@
import inspect
+import os
+import unittest
from pysr import PySRRegressor
@@ -6,3 +8,8 @@
DEFAULT_NITERATIONS = DEFAULT_PARAMS["niterations"].default
DEFAULT_POPULATIONS = DEFAULT_PARAMS["populations"].default
DEFAULT_NCYCLES = DEFAULT_PARAMS["ncycles_per_iteration"].default
+
+skip_if_beartype = unittest.skipIf(
+ os.environ.get("PYSR_USE_BEARTYPE", "0") == "1",
+ "Skipping because beartype would fail test",
+)
diff --git a/pysr/test/test_dev.py b/pysr/test/test_dev.py
index b8a2b4645..a0f4f7f5d 100644
--- a/pysr/test/test_dev.py
+++ b/pysr/test/test_dev.py
@@ -7,8 +7,8 @@
class TestDev(unittest.TestCase):
def test_simple_change_to_backend(self):
"""Test that we can use a development version of SymbolicRegression.jl"""
- PYSR_TEST_JULIA_VERSION = os.environ.get("PYSR_TEST_JULIA_VERSION", "1.6")
- PYSR_TEST_PYTHON_VERSION = os.environ.get("PYSR_TEST_PYTHON_VERSION", "3.9")
+ PYSR_TEST_JULIA_VERSION = os.environ.get("PYSR_TEST_JULIA_VERSION", "1.11")
+ PYSR_TEST_PYTHON_VERSION = os.environ.get("PYSR_TEST_PYTHON_VERSION", "3.12")
build_result = subprocess.run(
[
"docker",
diff --git a/pysr/test/test_dev_pysr.dockerfile b/pysr/test/test_dev_pysr.dockerfile
index 2978e82b7..fcde426b9 100644
--- a/pysr/test/test_dev_pysr.dockerfile
+++ b/pysr/test/test_dev_pysr.dockerfile
@@ -2,8 +2,8 @@
# tries to manually edit SymbolicRegression.jl and
# use it from PySR.
-ARG JLVERSION=1.9.4
-ARG PYVERSION=3.11.6
+ARG JLVERSION=1.11.1
+ARG PYVERSION=3.12.6
ARG BASE_IMAGE=bullseye
FROM julia:${JLVERSION}-${BASE_IMAGE} AS jl
@@ -15,10 +15,6 @@ ENV PATH="/usr/local/julia/bin:${PATH}"
WORKDIR /pysr
-# Caches install (https://stackoverflow.com/questions/25305788/how-to-avoid-reinstalling-packages-when-building-docker-image-for-python-project)
-ADD ./requirements.txt /pysr/requirements.txt
-RUN pip3 install --no-cache-dir -r /pysr/requirements.txt
-
# Install PySR:
# We do a minimal copy so it doesn't need to rerun at every file change:
ADD ./pyproject.toml /pysr/pyproject.toml
diff --git a/pysr/test/test_jax.py b/pysr/test/test_jax.py
index e0237c829..d261afcf5 100644
--- a/pysr/test/test_jax.py
+++ b/pysr/test/test_jax.py
@@ -1,5 +1,6 @@
import unittest
from functools import partial
+from pathlib import Path
import numpy as np
import pandas as pd
@@ -46,11 +47,12 @@ def test_pipeline_pandas(self):
}
)
- equations["Complexity Loss Equation".split(" ")].to_csv(
- "equation_file.csv.bkup"
- )
+ for fname in ["hall_of_fame.csv.bak", "hall_of_fame.csv"]:
+ equations["Complexity Loss Equation".split(" ")].to_csv(
+ Path(model.output_directory_) / model.run_id_ / fname
+ )
- model.refresh(checkpoint_file="equation_file.csv")
+ model.refresh(run_directory=str(Path(model.output_directory_) / model.run_id_))
jformat = model.jax()
np.testing.assert_almost_equal(
@@ -73,11 +75,12 @@ def test_pipeline(self):
}
)
- equations["Complexity Loss Equation".split(" ")].to_csv(
- "equation_file.csv.bkup"
- )
+ for fname in ["hall_of_fame.csv.bak", "hall_of_fame.csv"]:
+ equations["Complexity Loss Equation".split(" ")].to_csv(
+ Path(model.output_directory_) / model.run_id_ / fname
+ )
- model.refresh(checkpoint_file="equation_file.csv")
+ model.refresh(run_directory=str(Path(model.output_directory_) / model.run_id_))
jformat = model.jax()
np.testing.assert_almost_equal(
@@ -121,6 +124,8 @@ def test_feature_selection_custom_operators(self):
def cos_approx(x):
return 1 - (x**2) / 2 + (x**4) / 24 + (x**6) / 720
+ sp_cos_approx = sympy.Function("cos_approx")
+
y = X["k15"] ** 2 + 2 * cos_approx(X["k20"])
model = PySRRegressor(
@@ -129,26 +134,19 @@ def cos_approx(x):
select_k_features=3,
maxsize=10,
early_stop_condition=1e-5,
- extra_sympy_mappings={"cos_approx": cos_approx},
+ extra_sympy_mappings={"cos_approx": sp_cos_approx},
extra_jax_mappings={
- "cos_approx": "(lambda x: 1 - x**2 / 2 + x**4 / 24 + x**6 / 720)"
+ sp_cos_approx: "(lambda x: 1 - x**2 / 2 + x**4 / 24 + x**6 / 720)"
},
random_state=0,
deterministic=True,
- procs=0,
- multithreading=False,
+ parallelism="serial",
)
np.random.seed(0)
model.fit(X.values, y.values)
f, parameters = model.jax().values()
-
- np_prediction = model.predict
jax_prediction = partial(f, parameters=parameters)
-
- np_output = np_prediction(X.values)
jax_output = jax_prediction(X.values)
-
- np.testing.assert_almost_equal(y.values, np_output, decimal=3)
np.testing.assert_almost_equal(y.values, jax_output, decimal=3)
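(The recurring pattern in these JAX/Torch tests reflects the new on-disk layout, `<output_directory>/<run_id>/hall_of_fame.csv` plus a `.bak` backup; a sketch with `model` and `equations` as in the tests above:)

    from pathlib import Path

    run_dir = Path(model.output_directory_) / model.run_id_
    for fname in ["hall_of_fame.csv.bak", "hall_of_fame.csv"]:
        # Write the hall of fame where refresh() expects to find it:
        equations.to_csv(run_dir / fname)
    model.refresh(run_directory=str(run_dir))  # reload equations from that run
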
diff --git a/pysr/test/test.py b/pysr/test/test_main.py
similarity index 83%
rename from pysr/test/test.py
rename to pysr/test/test_main.py
index c641e9f66..3264383ad 100644
--- a/pysr/test/test.py
+++ b/pysr/test/test_main.py
@@ -1,6 +1,7 @@
import importlib
import os
import pickle as pkl
+import platform
import tempfile
import traceback
import unittest
@@ -12,7 +13,15 @@
import sympy # type: ignore
from sklearn.utils.estimator_checks import check_estimator
-from pysr import PySRRegressor, install, jl, load_all_packages
+from pysr import (
+ ParametricExpressionSpec,
+ PySRRegressor,
+ TemplateExpressionSpec,
+ TensorBoardLoggerSpec,
+ install,
+ jl,
+ load_all_packages,
+)
from pysr.export_latex import sympy2latex
from pysr.feature_selection import _handle_feature_selection, run_feature_selection
from pysr.julia_helpers import init_julia
@@ -22,13 +31,13 @@
_suggest_keywords,
idx_model_selection,
)
-from pysr.utils import _csv_filename_to_pkl_filename
from .params import (
DEFAULT_NCYCLES,
DEFAULT_NITERATIONS,
DEFAULT_PARAMS,
DEFAULT_POPULATIONS,
+ skip_if_beartype,
)
# Disables local saving:
@@ -94,7 +103,7 @@ def test_multiprocessing_turbo_custom_objective(self):
# Turbo needs to work with unsafe operators:
unary_operators=["sqrt"],
procs=2,
- multithreading=False,
+ parallelism="multiprocessing",
turbo=True,
early_stop_condition="stop_if(loss, complexity) = loss < 1e-10 && complexity == 1",
loss_function="""
@@ -348,9 +357,8 @@ def test_warm_start_set_at_init(self):
def test_noisy_builtin_variable_names(self):
y = self.X[:, [0, 1]] ** 2 + self.rstate.randn(self.X.shape[0], 1) * 0.05
model = PySRRegressor(
- # Test that passing a single operator works:
- unary_operators="sq(x) = x^2",
- binary_operators="plus",
+ unary_operators=["sq(x) = x^2"],
+ binary_operators=["plus"],
extra_sympy_mappings={"sq": lambda x: x**2},
**self.default_test_kwargs,
procs=0,
@@ -447,16 +455,17 @@ def test_load_model(self):
csv_file_data = "\n".join([line.strip() for line in csv_file_data.split("\n")])
for from_backup in [False, True]:
- rand_dir = Path(tempfile.mkdtemp())
- equation_filename = str(rand_dir / "equation.csv")
- with open(equation_filename + (".bkup" if from_backup else ""), "w") as f:
+ output_directory = Path(tempfile.mkdtemp())
+ equation_filename = str(output_directory / "hall_of_fame.csv")
+ with open(equation_filename + (".bak" if from_backup else ""), "w") as f:
f.write(csv_file_data)
model = PySRRegressor.from_file(
- equation_filename,
+ run_directory=output_directory,
n_features_in=5,
feature_names_in=["f0", "f1", "f2", "f3", "f4"],
binary_operators=["+", "*", "/", "-", "^"],
unary_operators=["cos"],
+ precision=64,
)
X = self.rstate.rand(100, 5)
y_truth = 2.2683423 ** np.cos(X[:, 3])
@@ -468,9 +477,8 @@ def test_load_model_simple(self):
# Test that we can simply load a model from its equation file.
y = self.X[:, [0, 1]] ** 2
model = PySRRegressor(
- # Test that passing a single operator works:
- unary_operators="sq(x) = x^2",
- binary_operators="plus",
+ unary_operators=["sq(x) = x^2"],
+ binary_operators=["plus"],
extra_sympy_mappings={"sq": lambda x: x**2},
**self.default_test_kwargs,
procs=0,
@@ -478,27 +486,28 @@ def test_load_model_simple(self):
early_stop_condition="stop_if(loss, complexity) = loss < 0.05 && complexity == 2",
)
rand_dir = Path(tempfile.mkdtemp())
- equation_file = rand_dir / "equations.csv"
+ equation_file = rand_dir / "1" / "hall_of_fame.csv"
model.set_params(temp_equation_file=False)
- model.set_params(equation_file=equation_file)
+ model.set_params(output_directory=rand_dir)
+ model.set_params(run_id="1")
model.fit(self.X, y)
# lambda functions are removed from the pickling, so we need
# to pass it during the loading:
model2 = PySRRegressor.from_file(
- model.equation_file_, extra_sympy_mappings={"sq": lambda x: x**2}
+ run_directory=rand_dir / "1", extra_sympy_mappings={"sq": lambda x: x**2}
)
np.testing.assert_allclose(model.predict(self.X), model2.predict(self.X))
# Try again, but using only the pickle file:
- for file_to_delete in [str(equation_file), str(equation_file) + ".bkup"]:
+ for file_to_delete in [str(equation_file), str(equation_file) + ".bak"]:
if os.path.exists(file_to_delete):
os.remove(file_to_delete)
# pickle_file = rand_dir / "equations.pkl"
model3 = PySRRegressor.from_file(
- model.equation_file_, extra_sympy_mappings={"sq": lambda x: x**2}
+ run_directory=rand_dir / "1", extra_sympy_mappings={"sq": lambda x: x**2}
)
np.testing.assert_allclose(model.predict(self.X), model3.predict(self.X))
@@ -512,38 +521,208 @@ def test_jl_function_error(self):
str(cm.exception),
)
+ def test_template_expressions_and_custom_complexity(self):
+ # Create random data between -1 and 1
+ X = self.rstate.uniform(-1, 1, (100, 2))
+
+ # Ground truth: sin(x + y)
+ y = np.sin(X[:, 0] + X[:, 1])
+
+ # Create model with template that includes the missing sin operator
+ model = PySRRegressor(
+ expression_spec=TemplateExpressionSpec(
+ ["f"], "sin_of_f((; f), (x, y)) = sin(f(x, y))"
+ ),
+ binary_operators=["+", "-", "*", "/"],
+ unary_operators=[], # No sin operator!
+ maxsize=10,
+ early_stop_condition="stop_if(loss, complexity) = loss < 1e-10 && complexity == 6",
+ # Custom complexity *function*:
+ complexity_mapping="my_complexity(ex) = sum(t -> 2, get_tree(ex))",
+ **self.default_test_kwargs,
+ )
+
+ model.fit(X, y)
+
+ # Test on out of domain data - this should still work due to sin template!
+ X_test = self.rstate.uniform(2, 10, (25, 2))
+ y_test = np.sin(X_test[:, 0] + X_test[:, 1])
+ y_pred = model.predict(X_test)
+
+ test_mse = np.mean((y_test - y_pred) ** 2)
+ self.assertLess(test_mse, 1e-5)
+
+ # Check there is a row with complexity 6 and MSE < 1e-10
+ df = model.equations_
+ good_rows = df[(df.complexity == 6) & (df.loss < 1e-10)]
+ self.assertGreater(len(good_rows), 0)
+
+ # Check there are NO rows with lower complexity and MSE < 1e-10
+ simpler_good_rows = df[(df.complexity < 6) & (df.loss < 1e-10)]
+ self.assertEqual(len(simpler_good_rows), 0)
+
+ # Make sure that a nice error is raised if we try to get the sympy expression:
+ # f"`expression_spec={self.expression_spec_}` does not support sympy export."
+ with self.assertRaises(ValueError) as cm:
+ model.sympy()
+ self.assertRegex(
+ str(cm.exception),
+ r"`expression_spec=.*TemplateExpressionSpec.*` does not support sympy export.",
+ )
+ with self.assertRaises(ValueError):
+ model.latex()
+ with self.assertRaises(ValueError):
+ model.jax()
+ with self.assertRaises(ValueError):
+ model.pytorch()
+ with self.assertRaises(ValueError):
+ model.latex_table()
+
+ def test_parametric_expression(self):
+ # Create data with two classes
+ n_points = 100
+ X = self.rstate.uniform(-3, 3, (n_points, 2)) # x1, x2
+ category = self.rstate.randint(0, 3, n_points) # class (0 or 1)
+
+ # True parameters for each class
+ P1 = [0.1, 1.5, -5.2] # phase shift for each class
+ P2 = [3.2, 0.5, 1.2] # offset for each class
+
+ # Ground truth: 2*cos(x2 + P1[class]) + x1^2 - P2[class]
+ y = np.array(
+ [
+ 2 * np.cos(x2 + P1[c]) + x1**2 - P2[c]
+ for x1, x2, c in zip(X[:, 0], X[:, 1], category)
+ ]
+ )
+
+ model = PySRRegressor(
+ expression_spec=ParametricExpressionSpec(max_parameters=2),
+ binary_operators=["+", "*", "/", "-"],
+ unary_operators=["cos", "exp"],
+ maxsize=20,
+ early_stop_condition="stop_if(loss, complexity) = loss < 1e-4 && complexity <= 14",
+ **self.default_test_kwargs,
+ )
+
+ model.fit(X, y, category=category)
+
+ # Test on new data points
+ X_test = self.rstate.uniform(-6, 6, (10, 2))
+ category_test = self.rstate.randint(0, 3, 10)
+
+ y_test = np.array(
+ [
+ 2 * np.cos(x2 + P1[c]) + x1**2 - P2[c]
+ for x1, x2, c in zip(X_test[:, 0], X_test[:, 1], category_test)
+ ]
+ )
+
+ y_test_pred = model.predict(X_test, category=category_test)
+ test_mse = np.mean((y_test - y_test_pred) ** 2)
+ self.assertLess(test_mse, 1e-3)
+
+ with self.assertRaises(ValueError):
+ model.sympy()
+ with self.assertRaises(ValueError):
+ model.latex()
+ with self.assertRaises(ValueError):
+ model.jax()
+ with self.assertRaises(ValueError):
+ model.pytorch()
+ with self.assertRaises(ValueError):
+ model.latex_table()
+
+ def test_tensorboard_logger(self):
+ """Test TensorBoard logger functionality."""
+ if platform.system() == "Windows":
+ self.skipTest("Skipping test on Windows")
+
+ try:
+ from tensorboard.backend.event_processing.event_accumulator import ( # type: ignore
+ EventAccumulator,
+ )
+ except ImportError:
+ self.skipTest("TensorBoard not installed. Skipping test.")
+
+ y = self.X[:, 0]
+ with tempfile.TemporaryDirectory() as tmpdir:
+ logger_spec = TensorBoardLoggerSpec(
+ log_dir=tmpdir, log_interval=2, overwrite=True
+ )
+ model = PySRRegressor(
+ **self.default_test_kwargs,
+ logger_spec=logger_spec,
+ early_stop_condition="stop_if(loss, complexity) = loss < 1e-4 && complexity == 1",
+ )
+ model.fit(self.X, y)
+
+ # Verify log directory exists and contains TensorBoard files
+ log_dir = Path(tmpdir)
+ assert log_dir.exists()
+ files = list(log_dir.glob("events.out.tfevents.*"))
+ assert len(files) == 1
+
+ # Load and verify TensorBoard events
+ event_acc = EventAccumulator(str(log_dir))
+ event_acc.Reload()
+
+ # Check that we have the expected scalar summaries
+ scalars = event_acc.Tags()["scalars"]
+ self.assertIn("search/data/summaries/pareto_volume", scalars)
+ self.assertIn("search/data/summaries/min_loss", scalars)
+
+ # Check that we have multiple events for each summary
+ pareto_events = event_acc.Scalars("search/data/summaries/pareto_volume")
+ min_loss_events = event_acc.Scalars("search/data/summaries/min_loss")
+
+ self.assertGreater(len(pareto_events), 0)
+ self.assertGreater(len(min_loss_events), 0)
+
+ # Verify model still works as expected
+ self.assertLessEqual(model.get_best()["loss"], 1e-4)
+
def manually_create_model(equations, feature_names=None):
if feature_names is None:
feature_names = ["x0", "x1"]
+ output_directory = tempfile.mkdtemp()
+ run_id = "test"
model = PySRRegressor(
progress=False,
niterations=1,
extra_sympy_mappings={},
output_jax_format=False,
model_selection="accuracy",
- equation_file="equation_file.csv",
+ output_directory=output_directory,
+ run_id=run_id,
)
+ model.output_directory_ = output_directory
+ model.run_id_ = run_id
+ os.makedirs(Path(output_directory) / run_id, exist_ok=True)
# Set up internal parameters as if it had been fitted:
if isinstance(equations, list):
# Multi-output.
- model.equation_file_ = "equation_file.csv"
model.nout_ = len(equations)
model.selection_mask_ = None
model.feature_names_in_ = np.array(feature_names, dtype=object)
for i in range(model.nout_):
equations[i]["complexity loss equation".split(" ")].to_csv(
- f"equation_file.csv.out{i+1}.bkup"
+ str(
+ Path(output_directory)
+ / run_id
+ / f"hall_of_fame_output{i+1}.csv.bak"
+ )
)
else:
- model.equation_file_ = "equation_file.csv"
model.nout_ = 1
model.selection_mask_ = None
model.feature_names_in_ = np.array(feature_names, dtype=object)
equations["complexity loss equation".split(" ")].to_csv(
- "equation_file.csv.bkup"
+ str(Path(output_directory) / run_id / "hall_of_fame.csv.bak")
)
model.refresh()
@@ -630,27 +809,12 @@ def test_feature_selection_handler(self):
class TestMiscellaneous(unittest.TestCase):
"""Test miscellaneous functions."""
- def test_csv_to_pkl_conversion(self):
- """Test that csv filename to pkl filename works as expected."""
- tmpdir = Path(tempfile.mkdtemp())
- equation_file = tmpdir / "equations.389479384.28378374.csv"
- expected_pkl_file = tmpdir / "equations.389479384.28378374.pkl"
-
- # First, test inputting the paths:
- test_pkl_file = _csv_filename_to_pkl_filename(equation_file)
- self.assertEqual(test_pkl_file, str(expected_pkl_file))
-
- # Next, test inputting the strings.
- test_pkl_file = _csv_filename_to_pkl_filename(str(equation_file))
- self.assertEqual(test_pkl_file, str(expected_pkl_file))
-
def test_pickle_with_temp_equation_file(self):
"""If we have a temporary equation file, unpickle the estimator."""
model = PySRRegressor(
populations=int(1 + DEFAULT_POPULATIONS / 5),
temp_equation_file=True,
- procs=0,
- multithreading=False,
+ parallelism="serial",
)
nout = 3
X = np.random.randn(100, 2)
@@ -660,9 +824,15 @@ def test_pickle_with_temp_equation_file(self):
y_predictions = model.predict(X)
- equation_file_base = model.equation_file_
+ equation_file_base = Path("outputs") / model.run_id_ / "hall_of_fame"
+ for i in range(1, nout + 1):
+ assert not os.path.exists(str(equation_file_base) + f"_output{i}.csv.bak")
+
+ equation_file_base = (
+ Path(model.output_directory_) / model.run_id_ / "hall_of_fame"
+ )
for i in range(1, nout + 1):
- assert not os.path.exists(str(equation_file_base) + f".out{i}.bkup")
+ assert os.path.exists(str(equation_file_base) + f"_output{i}.csv.bak")
with tempfile.NamedTemporaryFile() as pickle_file:
pkl.dump(model, pickle_file)
@@ -687,8 +857,7 @@ def test_scikit_learn_compatibility(self):
progress=False,
random_state=0,
deterministic=True, # Deterministic as tests require this.
- procs=0,
- multithreading=False,
+ parallelism="serial",
warm_start=False,
temp_equation_file=True,
) # Return early.
@@ -748,6 +917,7 @@ def test_load_all_packages(self):
class TestHelpMessages(unittest.TestCase):
"""Test user help messages."""
+ @skip_if_beartype
def test_deprecation(self):
"""Ensure that deprecation works as expected.
@@ -760,6 +930,14 @@ def test_deprecation(self):
# The correct value should be set:
self.assertEqual(model.fraction_replaced, 0.2)
+ with self.assertRaises(NotImplementedError):
+ model.equation_file_
+
+ with self.assertRaises(ValueError) as cm:
+ PySRRegressor.from_file(equation_file="", run_directory="")
+
+ self.assertIn("Passing `equation_file` is deprecated", str(cm.exception))
+
def test_deprecated_functions(self):
with self.assertWarns(FutureWarning):
install()
@@ -807,7 +985,7 @@ def test_deterministic_warnings(self):
warnings.simplefilter("error")
with self.assertRaises(Exception) as context:
model.fit(X, y)
- self.assertIn("`deterministic`", str(context.exception))
+ self.assertIn("`deterministic=True`", str(context.exception))
def test_deterministic_errors(self):
"""Setting deterministic without random_state should error"""
@@ -846,6 +1024,7 @@ def test_bad_variable_names_fail(self):
model.fit(X, y, variable_names=["f{c}"])
self.assertIn("Invalid variable name", str(cm.exception))
+ @skip_if_beartype
def test_bad_kwargs(self):
bad_kwargs = [
dict(
@@ -1207,8 +1386,8 @@ def test_unit_propagation(self):
"""
X = np.ones((100, 3))
y = np.ones((100, 1))
- temp_dir = Path(tempfile.mkdtemp())
- equation_file = str(temp_dir / "equation_file.csv")
+ output_dir = tempfile.mkdtemp()
+ run_id = "test"
model = PySRRegressor(
binary_operators=["+", "*"],
early_stop_condition="(l, c) -> l < 1e-6 && c == 3",
@@ -1219,11 +1398,11 @@ def test_unit_propagation(self):
complexity_of_constants=10,
weight_mutate_constant=0.0,
should_optimize_constants=False,
- multithreading=False,
+ parallelism="serial",
deterministic=True,
- procs=0,
random_state=0,
- equation_file=equation_file,
+ output_directory=output_dir,
+ run_id=run_id,
warm_start=True,
)
model.fit(
@@ -1243,16 +1422,18 @@ def test_unit_propagation(self):
)
# With pkl file:
- pkl_file = str(temp_dir / "equation_file.pkl")
- model2 = PySRRegressor.from_file(pkl_file)
+ run_directory = str(Path(output_dir) / run_id)
+ model2 = PySRRegressor.from_file(run_directory=run_directory)
best2 = model2.get_best()
self.assertIn("x0", best2["equation"])
# From csv file alone (we need to delete pkl file:)
# First, we delete the pkl file:
- os.remove(pkl_file)
+ os.remove(Path(run_directory) / "checkpoint.pkl")
model3 = PySRRegressor.from_file(
- equation_file, binary_operators=["+", "*"], n_features_in=X.shape[1]
+ run_directory=run_directory,
+ binary_operators=["+", "*"],
+ n_features_in=X.shape[1],
)
best3 = model3.get_best()
self.assertIn("x0", best3["equation"])
diff --git a/pysr/test/test_nb.ipynb b/pysr/test/test_nb.ipynb
index 1cd394fff..ea0084ad7 100644
--- a/pysr/test/test_nb.ipynb
+++ b/pysr/test/test_nb.ipynb
@@ -122,7 +122,7 @@
"X = np.random.randn(10, 2)\n",
"y = np.random.randn(10)\n",
"\n",
- "model = PySRRegressor(deterministic=True, multithreading=False, procs=0, random_state=0, verbosity=0, progress=False, niterations=1, ncycles_per_iteration=1)\n",
+ "model = PySRRegressor(deterministic=True, parallelism=\"serial\", random_state=0, verbosity=0, progress=False, niterations=1, ncycles_per_iteration=1)\n",
"str(model)"
]
},
diff --git a/pysr/test/test_startup.py b/pysr/test/test_startup.py
index 6ad64b624..a92010cee 100644
--- a/pysr/test/test_startup.py
+++ b/pysr/test/test_startup.py
@@ -43,7 +43,8 @@ def test_warm_start_from_file(self):
)
model.warm_start = True
model.temp_equation_file = False
- model.equation_file = Path(tmpdirname) / "equations.csv"
+ model.output_directory = tmpdirname
+ model.run_id = "test"
model.deterministic = True
model.multithreading = False
model.random_state = 0
@@ -76,7 +77,9 @@ def test_warm_start_from_file(self):
y = np.load("{y_file}")
print("Loading model from file")
- model = PySRRegressor.from_file("{model.equation_file}")
+ model = PySRRegressor.from_file(
+ run_directory="{str(Path(tmpdirname) / model.run_id_)}"
+ )
assert model.julia_state_ is not None
@@ -130,8 +133,6 @@ def test_bad_startup_options(self):
self.assertIn(warning_test["msg"], result.stderr.decode())
def test_notebook(self):
- if jl_version < (1, 9, 0):
- self.skipTest("Julia version too old")
if platform.system() == "Windows":
self.skipTest("Notebook test incompatible with Windows")
if not os.access(Path(__file__).parent, os.W_OK):
diff --git a/pysr/test/test_torch.py b/pysr/test/test_torch.py
index 8b26f5ca6..256d21f86 100644
--- a/pysr/test/test_torch.py
+++ b/pysr/test/test_torch.py
@@ -1,4 +1,5 @@
import unittest
+from pathlib import Path
import numpy as np
import pandas as pd
@@ -48,11 +49,12 @@ def test_pipeline_pandas(self):
}
)
- equations["Complexity Loss Equation".split(" ")].to_csv(
- "equation_file.csv.bkup"
- )
+ for fname in ["hall_of_fame.csv.bak", "hall_of_fame.csv"]:
+ equations["Complexity Loss Equation".split(" ")].to_csv(
+ Path(model.output_directory_) / model.run_id_ / fname
+ )
- model.refresh(checkpoint_file="equation_file.csv")
+ model.refresh(run_directory=str(Path(model.output_directory_) / model.run_id_))
tformat = model.pytorch()
self.assertEqual(str(tformat), "_SingleSymPyModule(expression=cos(x1)**2)")
@@ -81,11 +83,12 @@ def test_pipeline(self):
}
)
- equations["Complexity Loss Equation".split(" ")].to_csv(
- "equation_file.csv.bkup"
- )
+ for fname in ["hall_of_fame.csv.bak", "hall_of_fame.csv"]:
+ equations["Complexity Loss Equation".split(" ")].to_csv(
+ Path(model.output_directory_) / model.run_id_ / fname
+ )
- model.refresh(checkpoint_file="equation_file.csv")
+ model.refresh(run_directory=str(Path(model.output_directory_) / model.run_id_))
tformat = model.pytorch()
self.assertEqual(str(tformat), "_SingleSymPyModule(expression=cos(x1)**2)")
@@ -133,21 +136,26 @@ def test_custom_operator(self):
}
)
- equations["Complexity Loss Equation".split(" ")].to_csv(
- "equation_file_custom_operator.csv.bkup"
- )
+ for fname in ["hall_of_fame.csv.bak", "hall_of_fame.csv"]:
+ equations["Complexity Loss Equation".split(" ")].to_csv(
+ Path(model.output_directory_) / model.run_id_ / fname
+ )
+
+ MyCustomOperator = sympy.Function("mycustomoperator")
model.set_params(
- equation_file="equation_file_custom_operator.csv",
- extra_sympy_mappings={"mycustomoperator": sympy.sin},
- extra_torch_mappings={"mycustomoperator": self.torch.sin},
+ extra_sympy_mappings={"mycustomoperator": MyCustomOperator},
+ extra_torch_mappings={MyCustomOperator: self.torch.sin},
)
- model.refresh(checkpoint_file="equation_file_custom_operator.csv")
- self.assertEqual(str(model.sympy()), "sin(x1)")
+ # TODO: We shouldn't need to specify the run directory here.
+ model.refresh(run_directory=str(Path(model.output_directory_) / model.run_id_))
+ # self.assertEqual(str(model.sympy()), "sin(x1)")
# Will automatically use the set global state from get_hof.
tformat = model.pytorch()
- self.assertEqual(str(tformat), "_SingleSymPyModule(expression=sin(x1))")
+ self.assertEqual(
+ str(tformat), "_SingleSymPyModule(expression=mycustomoperator(x1))"
+ )
np.testing.assert_almost_equal(
tformat(self.torch.tensor(X)).detach().numpy(),
np.sin(X[:, 1]),
@@ -200,11 +208,9 @@ def cos_approx(x):
maxsize=10,
early_stop_condition=1e-5,
extra_sympy_mappings={"cos_approx": cos_approx},
- extra_torch_mappings={"cos_approx": cos_approx},
random_state=0,
deterministic=True,
- procs=0,
- multithreading=False,
+ parallelism="serial",
)
np.random.seed(0)
model.fit(X.values, y.values)
diff --git a/pysr/utils.py b/pysr/utils.py
index de7faf16e..e18a5b4c3 100644
--- a/pysr/utils.py
+++ b/pysr/utils.py
@@ -1,42 +1,35 @@
+from __future__ import annotations
+
import difflib
import inspect
-import os
import re
from pathlib import Path
-from typing import Any, List, TypeVar, Union
+from typing import Any, TypeVar
from numpy import ndarray
from sklearn.utils.validation import _check_feature_names_in # type: ignore
T = TypeVar("T", bound=Any)
-ArrayLike = Union[ndarray, List[T]]
-PathLike = Union[str, Path]
-
+ArrayLike = ndarray | list[T]
+PathLike = str | Path
-def _csv_filename_to_pkl_filename(csv_filename: PathLike) -> PathLike:
- if os.path.splitext(csv_filename)[1] == ".pkl":
- return csv_filename
- # Assume that the csv filename is of the form "foo.csv"
- assert str(csv_filename).endswith(".csv")
+_regexp_im = re.compile(r"\b(\d+\.\d+)im\b")
+_regexp_im_sci = re.compile(r"\b(\d+\.\d+)[eEfF]([+-]?\d+)im\b")
+_regexp_sci = re.compile(r"\b(\d+\.\d+)[eEfF]([+-]?\d+)\b")
- dirname = str(os.path.dirname(csv_filename))
- basename = str(os.path.basename(csv_filename))
- base = str(os.path.splitext(basename)[0])
- pkl_basename = base + ".pkl"
+def _apply_regexp_im(x: str):
+ return _regexp_im.sub(r"\1j", x)
- return os.path.join(dirname, pkl_basename)
+def _apply_regexp_im_sci(x: str):
+ return _regexp_im_sci.sub(r"\1e\2j", x)
-_regexp_im = re.compile(r"\b(\d+\.\d+)im\b")
-_regexp_im_sci = re.compile(r"\b(\d+\.\d+)[eEfF]([+-]?\d+)im\b")
-_regexp_sci = re.compile(r"\b(\d+\.\d+)[eEfF]([+-]?\d+)\b")
-_apply_regexp_im = lambda x: _regexp_im.sub(r"\1j", x)
-_apply_regexp_im_sci = lambda x: _regexp_im_sci.sub(r"\1e\2j", x)
-_apply_regexp_sci = lambda x: _regexp_sci.sub(r"\1e\2", x)
+def _apply_regexp_sci(x: str):
+ return _regexp_sci.sub(r"\1e\2", x)
def _preprocess_julia_floats(s: str) -> str:
@@ -65,7 +58,7 @@ def _subscriptify(i: int) -> str:
return "".join([chr(0x2080 + int(c)) for c in str(i)])
-def _suggest_keywords(cls, k: str) -> List[str]:
+def _suggest_keywords(cls, k: str) -> list[str]:
valid_keywords = [
param
for param in inspect.signature(cls.__init__).parameters
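(What the three regexes normalize, illustrated on hypothetical Julia-printed literals:)

    # Julia prints Float32 exponents with "f" and imaginary parts with an "im"
    # suffix; these helpers rewrite them into Python-parseable forms:
    assert _apply_regexp_sci("1.5f-3") == "1.5e-3"
    assert _apply_regexp_im("2.0im") == "2.0j"
    assert _apply_regexp_im_sci("2.0f-3im") == "2.0e-3j"
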
diff --git a/requirements.txt b/requirements.txt
deleted file mode 100644
index aa92aaf13..000000000
--- a/requirements.txt
+++ /dev/null
@@ -1,7 +0,0 @@
-sympy>=1.0.0,<2.0.0
-pandas>=0.21.0,<3.0.0
-numpy>=1.13.0,<3.0.0
-scikit_learn>=1.0.0,<2.0.0
-juliacall==0.9.23
-click>=7.0.0,<9.0.0
-setuptools>=50.0.0