diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml new file mode 100644 index 0000000..4537ba7 --- /dev/null +++ b/.github/workflows/deploy.yml @@ -0,0 +1,48 @@ +# This file was created automatically with `myst init --gh-pages` 🪄 💚 + +name: MyST GitHub Pages Deploy +on: + push: + # Runs on pushes targeting the default branch + branches: [main] +# env: + # `BASE_URL` determines the website is served from, including CSS & JS assets + # You may need to change this to `BASE_URL: ''` + # BASE_URL: /${{ github.event.repository.name }} + +# Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages +permissions: + contents: read + pages: write + id-token: write +# Allow only one concurrent deployment, skipping runs queued between the run in-progress and latest queued. +# However, do NOT cancel in-progress runs as we want to allow these production deployments to complete. +concurrency: + group: 'pages' + cancel-in-progress: false +jobs: + deploy: + environment: + name: github-pages + url: ${{ steps.deployment.outputs.page_url }} + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Setup Pages + uses: actions/configure-pages@v3 + - name: Setup Miniconda + uses: conda-incubator/setup-miniconda@v3 + with: + auto-activate-base: false + environment-file: environment.yml + python-version: 3.11 + activate-environment: rlbook + - name: Build HTML Assets + run: myst build --html --execute --ci + - name: Upload artifact + uses: actions/upload-pages-artifact@v1 + with: + path: './_build/html' + - name: Deploy to GitHub Pages + id: deployment + uses: actions/deploy-pages@v2 diff --git a/.gitignore b/.gitignore index d0cfe25..658b7b2 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,7 @@ _build/ .DS_Store .ipynb_checkpoints/ +__pycache__/ # exercise solutions solutions/ @@ -25,3 +26,6 @@ conf.py *.run.xml *.synctex.gz *.toc + +# MyST build outputs +_build diff --git a/Makefile b/Makefile index c2664a7..7c0f860 100644 --- a/Makefile +++ b/Makefile @@ -2,11 +2,10 @@ ENV_NAME = rlbook RUN = micromamba run -n $(ENV_NAME) -_NOTEBOOKS = $(addprefix book/, bandits contextual_bandits control exploration fitted_dp imitation_learning mdps pg planning supervised_learning) +_NOTEBOOKS = $(addprefix book/, bandits control exploration fitted_dp imitation_learning mdps pg planning supervised_learning) _META = \ background \ - bibliography \ index NOTEBOOKS = $(addsuffix .md, $(_NOTEBOOKS)) @@ -22,7 +21,7 @@ SOURCE = $(NOTEBOOKS) $(META) $(SOLUTIONS) CONFIG = book/_config.yml book/_toc.yml book/_build/html: $(SOURCE) $(CONFIG) - $(RUN) jb build book + $(RUN) jb build -W -n --keep-going book open: book/_build/html open book/_build/html/index.html diff --git a/appendix/appendix.pdf b/appendix/appendix.pdf deleted file mode 100644 index 932e891..0000000 Binary files a/appendix/appendix.pdf and /dev/null differ diff --git a/appendix/appendix.tex b/appendix/appendix.tex deleted file mode 100644 index 05a908b..0000000 --- a/appendix/appendix.tex +++ /dev/null @@ -1,18 +0,0 @@ -\providecommand{\main}{..} - -\documentclass[\main/main]{subfiles} - -\begin{document} - -\chapter{Derivations} - -\section{Natural policy gradient} - -The TRPO objective is -\[ - \max_\theta \E_{s_0, \dots, s_{H-1} \sim \rho_{\theta^k}} -\] -\todo{finish derivation from 2023-11-08 lecture} - -\end{document} - diff --git a/assets/boston_dynamics.jpg b/assets/boston_dynamics.jpg deleted file mode 100644 index d52f0ae..0000000 Binary files a/assets/boston_dynamics.jpg and /dev/null differ diff --git 
a/assets/cart_pole.png b/assets/cart_pole.png deleted file mode 100644 index f5e2763..0000000 Binary files a/assets/cart_pole.png and /dev/null differ diff --git a/assets/log_taylor.png b/assets/log_taylor.png deleted file mode 100644 index d61a4fd..0000000 Binary files a/assets/log_taylor.png and /dev/null differ diff --git a/assets/rubiks_cube.jpg b/assets/rubiks_cube.jpg deleted file mode 100644 index 913278e..0000000 Binary files a/assets/rubiks_cube.jpg and /dev/null differ diff --git a/book/_toc.yml b/book/_toc.yml index 018f641..814e610 100644 --- a/book/_toc.yml +++ b/book/_toc.yml @@ -15,6 +15,6 @@ chapters: - file: imitation_learning.md - file: planning.md - file: exploration.md - - file: contextual_bandits.md - - file: bibliography.md + # - file: contextual_bandits.md + # - file: bibliography.md - file: background.md diff --git a/book/background.md b/book/background.md index 5b691d1..dcce8cc 100644 --- a/book/background.md +++ b/book/background.md @@ -1,4 +1,16 @@ -(background)= +--- +jupytext: + text_representation: + extension: .md + format_name: myst + format_version: 0.13 + jupytext_version: 1.16.2 +kernelspec: + display_name: Python 3 (ipykernel) + language: python + name: python3 +--- + # Appendix: Background ## O notation diff --git a/book/bandits.md b/book/bandits.md index 331bf0e..bb549e0 100644 --- a/book/bandits.md +++ b/book/bandits.md @@ -11,16 +11,39 @@ kernelspec: name: python3 --- -(bandits)= # Multi-Armed Bandits -```{code-cell} ipython3 -:tags: [hide-input] +## Introduction + +The **multi-armed bandits** (MAB) setting is a simple setting for studying the basic challenges of sequential decision-making. +In this setting, an agent repeatedly chooses from a fixed set of actions, called **arms**, each of which has an associated reward distribution. The agent’s goal is to maximize the total reward it receives over some time period. + + + +In particular, we’ll spend a lot of time discussing the **Exploration-Exploitation Tradeoff**: should the agent choose new actions to learn more about the environment, or should it choose actions that it already knows to be good? + +::::{prf:example} Online advertising +:label: advertising + +Let’s suppose you, the agent, are an advertising company. You have $K$ different ads that you can show to users; For concreteness, let’s suppose there’s just a single user. You receive $1$ reward if the user clicks the ad, and $0$ otherwise. Thus, the unknown *reward distribution* associated to each ad is a Bernoulli distribution defined by the probability that the user clicks on the ad. Your goal is to maximize the total number of clicks by the user. +:::: + +::::{prf:example} Clinical trials +:label: clinical_trials + +Suppose you’re a pharmaceutical company, and you’re testing a new drug. You have $K$ different dosages of the drug that you can administer to patients. You receive $1$ reward if the patient recovers, and $0$ otherwise. Thus, the unknown *reward distribution* associated to each dosage is a Bernoulli distribution defined by the probability that the patient recovers. Your goal is to maximize the total number of patients that recover. +:::: + +In this chapter, we will introduce the multi-armed bandits setting, and discuss some of the challenges that arise when trying to solve problems in this setting. We will also introduce some of the key concepts that we will use throughout the book, such as regret and exploration-exploitation tradeoffs. 
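To make the setting concrete before the chapter's setup code below, here is a minimal sketch of a Bernoulli bandit environment in the spirit of the two examples above. The class and method names are illustrative assumptions, not the interface used later in the chapter.

```python
import numpy as np

class BernoulliBandit:
    """K arms; arm k pays out reward 1 with probability mu[k], unknown to the agent."""

    def __init__(self, mu, seed: int = 0):
        self.mu = np.asarray(mu)
        self.rng = np.random.default_rng(seed)

    def pull(self, k: int) -> int:
        # Sample a Bernoulli(mu[k]) reward for the chosen arm.
        return int(self.rng.random() < self.mu[k])

# e.g. three ads with click-through rates of 2%, 5%, and 3%
bandit = BernoulliBandit([0.02, 0.05, 0.03])
clicks = sum(bandit.pull(1) for _ in range(1000))  # always show ad 1
```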
+ +```{code-cell} ipython3 from jaxtyping import Float, Array import numpy as np - -# from bokeh.plotting import figure, show, output_notebook import latexify from typing import Callable, Union import matplotlib.pyplot as plt @@ -29,11 +52,6 @@ import solutions.bandits as solutions np.random.seed(184) -# output_notebook() # set up bokeh - -plt.style.use("fivethirtyeight") - - def random_argmax(ary: Array) -> int: """Take an argmax and randomize between ties.""" max_idx = np.flatnonzero(ary == ary.max()) @@ -49,32 +67,8 @@ latex = latexify.algorithmic( ) ``` -The **multi-armed bandits** (MAB) setting is a simple setting for studying the basic challenges of RL. In this setting, an agent repeatedly chooses from a fixed set of actions, called **arms**, each of which has an associated reward distribution. The agent’s goal is to maximize the total reward it receives over some time period. - -| States | Actions | Rewards | -| :----: | :-----: | :---------------------------------: | -| None | Finite | $\mathcal{A} \to \triangle([0, 1])$ | - -In particular, we’ll spend a lot of time discussing the **Exploration-Exploitation Tradeoff**: should the agent choose new actions to learn more about the environment, or should it choose actions that it already knows to be good? - -::::{prf:example} Online advertising -:label: advertising - -Let’s suppose you, the agent, are an advertising company. You have $K$ different ads that you can show to users; For concreteness, let’s suppose there’s just a single user. You receive $1$ reward if the user clicks the ad, and $0$ otherwise. Thus, the unknown *reward distribution* associated to each ad is a Bernoulli distribution defined by the probability that the user clicks on the ad. Your goal is to maximize the total number of clicks by the user. -:::: - -::::{prf:example} Clinical trials -:label: clinical_trials - -Suppose you’re a pharmaceutical company, and you’re testing a new drug. You have $K$ different dosages of the drug that you can administer to patients. You receive $1$ reward if the patient recovers, and $0$ otherwise. Thus, the unknown *reward distribution* associated to each dosage is a Bernoulli distribution defined by the probability that the patient recovers. Your goal is to maximize the total number of patients that recover. -:::: - -In this chapter, we will introduce the multi-armed bandits setting, and discuss some of the challenges that arise when trying to solve problems in this setting. We will also introduce some of the key concepts that we will use throughout the book, such as regret and exploration-exploitation tradeoffs. - +++ -## Introduction - ::::{prf:remark} Namesake :label: multi-armed @@ -188,7 +182,7 @@ algorithms in two different senses: $\E[\text{Regret}_T] \le M_T$. 2. Find a *high-probability* upper bound on the regret, i.e. show - $\P(\text{Regret}_T \le M_{T, \delta}) \ge 1-\delta$. + $\pr(\text{Regret}_T \le M_{T, \delta}) \ge 1-\delta$. Note that these two different approaches say very different things about the regret. The first approach says that the *average* regret is at most $M_T$. However, the agent might still achieve higher regret on many runs. The second approach says that, *with high probability*, the agent will achieve regret at most $M_{T, \delta}$. However, it doesn’t say anything about the regret in the remaining $\delta$ fraction of runs, which might be arbitrarily high. 
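To see how these two kinds of guarantees can diverge, here is a small simulation sketch (illustrative only, not the chapter's code): a toy strategy pulls each arm of a two-armed Bernoulli bandit once and then commits to the apparent winner. Its *average* regret is modest, but a sizeable fraction of runs commit to the wrong arm and incur near-maximal regret, which is exactly what a high-probability bound must account for.

```python
import numpy as np

rng = np.random.default_rng(0)
mu = np.array([0.6, 0.5])  # arm 0 is optimal
T, n_runs = 1_000, 10_000

regrets = np.zeros(n_runs)
for run in range(n_runs):
    first = (rng.random(2) < mu).astype(float)  # one sample from each arm
    chosen = int(np.argmax(first))              # commit to the apparent winner
    regrets[run] = (mu.max() - mu[chosen]) * (T - 2)

print("mean regret:            ", regrets.mean())               # roughly 20
print("95th-percentile regret: ", np.quantile(regrets, 0.95))   # roughly 100
```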
@@ -289,7 +283,7 @@ expected regret is simply: $$ \begin{aligned} - \E[\text{Regret}_T] &= \P(r^0 < r^1) \cdot T(\mu^0 - \mu^1) + c \\ + \E[\text{Regret}_T] &= \pr(r^0 < r^1) \cdot T(\mu^0 - \mu^1) + c \\ &= (1 - \mu^0) \mu^1 \cdot T(\mu^0 - \mu^1) + c \end{aligned} $$ @@ -368,7 +362,7 @@ Let $X_0, \dots, X_{n-1}$ be i.i.d. random variables with $X_i \in [0, 1]$ almost surely for each $i \in [n]$. Then for any $\delta > 0$, -$$\P\left( \left| \frac{1}{n} \sum_{i=1}^n (X_i - \E[X_i]) \right| > \sqrt{\frac{\ln(2/\delta)}{2n}} \right) \le \delta.$$ +$$\pr\left( \left| \frac{1}{n} \sum_{i=1}^n (X_i - \E[X_i]) \right| > \sqrt{\frac{\ln(2/\delta)}{2n}} \right) \le \delta.$$ ::: The proof of this inequality is beyond the scope of this book. See {cite}`vershynin_high-dimensional_2018` Chapter 2.2. @@ -376,9 +370,9 @@ The proof of this inequality is beyond the scope of this book. See {cite}`vershy We can apply this directly to the rewards for a given arm $k$, since the rewards from that arm are i.i.d.: :::{math} -:label: "hoeffding-etc" +:label: hoeffding-etc -\P\left(|\Delta^k | > \sqrt{\frac{\ln(2/\delta)}{2N_{\text{explore}}}} \right) \le \delta. +\pr\left(|\Delta^k | > \sqrt{\frac{\ln(2/\delta)}{2N_{\text{explore}}}} \right) \le \delta. ::: But note that we can’t apply this to arm $\hat k$ directly since @@ -394,12 +388,12 @@ The **union bound** provides a simple way to do this: Consider a set of events $A_0, \dots, A_{n-1}$. Then -$$\P(\exists i \in [n]. A_i) \le \sum_{i=0}^{n-1} \P(A_i).$$ +$$\pr(\exists i \in [n]. A_i) \le \sum_{i=0}^{n-1} \pr(A_i).$$ In -particular, if $\P(A_i) \ge 1 - \delta$ for each $i \in [n]$, we have +particular, if $\pr(A_i) \ge 1 - \delta$ for each $i \in [n]$, we have -$$\P(\forall i \in [n]. A_i) \ge 1 - n \delta.$$ +$$\pr(\forall i \in [n]. A_i) \ge 1 - n \delta.$$ ::: **Exercise:** Prove the second statement above. @@ -408,7 +402,7 @@ Applying the union bound across the arms for the l.h.s. event of {eq}`hoeffding- $$ \begin{aligned} - \P\left( \forall k \in [K], |\Delta^k | \le \sqrt{\frac{\ln(2/\delta)}{2N_{\text{explore}}}} \right) &\ge 1-K\delta + \pr\left( \forall k \in [K], |\Delta^k | \le \sqrt{\frac{\ln(2/\delta)}{2N_{\text{explore}}}} \right) &\ge 1-K\delta \end{aligned} $$ @@ -514,7 +508,7 @@ upper confidence bound $M^k_t$ such that $\hat \mu^k_t \le M^k_t$ with high probability, and then choose $a_t := \arg \max_{k \in [K]} M^k_t$. But how should we compute $M^k_t$? -In [](etc-regret-analysis), we were able to compute this bound +In [](#etc-regret-analysis), we were able to compute this bound using Hoeffding’s inequality, which assumes that the number of samples is *fixed*. This was the case in ETC (where we pull each arm $N_{\text{explore}}$ times), but in UCB, the number of times we pull @@ -554,7 +548,7 @@ $N^k_t$: $$ \begin{aligned} - \P\left( \forall n \le t, |\tilde \mu^k_n - \mu^k | \le \sqrt{\frac{\ln(2/\delta)}{2n}} \right) &\ge 1-t\delta. + \pr\left( \forall n \le t, |\tilde \mu^k_n - \mu^k | \le \sqrt{\frac{\ln(2/\delta)}{2n}} \right) &\ge 1-t\delta. \end{aligned} $$ @@ -562,7 +556,7 @@ In particular, since $N^k_t \le t$, and $\tilde \mu^k_{N^k_t} = \hat \mu^k_t$ by $$ \begin{aligned} - \P\left( |\hat \mu^k_t - \mu^k | \le \sqrt{\frac{\ln(2t/\delta')}{2N^k_t}} \right) &\ge 1-\delta' \text{ where } \delta' := t \delta. + \pr\left( |\hat \mu^k_t - \mu^k | \le \sqrt{\frac{\ln(2t/\delta')}{2N^k_t}} \right) &\ge 1-\delta' \text{ where } \delta' := t \delta. \end{aligned} $$ @@ -616,7 +610,7 @@ yourself for practice). 
$$ \begin{aligned} - \P\left(\forall k \le K, t < T. |\hat \mu^k_t - \mu^k | \le B^k_t \right) &\ge 1-\delta'' \\ + \pr\left(\forall k \le K, t < T. |\hat \mu^k_t - \mu^k | \le B^k_t \right) &\ge 1-\delta'' \\ \text{where} \quad B^k_t &:= \sqrt{\frac{\ln(2TK/\delta'')}{2N^k_t}}. \end{aligned} $$ @@ -748,7 +742,7 @@ In this case, upon viewing some reward, we can exactly calculate the **posterior $$ \begin{aligned} - \P(\boldsymbol{\mu} \mid a_0, r_0) &\propto \P(r_0 \mid a_0, \boldsymbol{\mu}) \P(a_0 \mid \boldsymbol{\mu}) \P(\boldsymbol{\mu}) \\ + \pr(\boldsymbol{\mu} \mid a_0, r_0) &\propto \pr(r_0 \mid a_0, \boldsymbol{\mu}) \pr(a_0 \mid \boldsymbol{\mu}) \pr(\boldsymbol{\mu}) \\ &\propto (\mu^{a_0})^{r_0} (1 - \mu^{a_0})^{1-r_0}. \end{aligned} $$ @@ -803,6 +797,161 @@ the *constant factor* is optimal as well. +++ +## Contextual bandits + +In the above MAB environment, the reward distributions of the arms +remain constant. However, in many real-world settings, we might receive +additional information that affects these distributions. For example, in +the online advertising case where each arm corresponds to an ad we could +show the user, we might receive information about the user's preferences +that changes how likely they are to click on a given ad. We can model +such environments using **contextual bandits**. + +:::{prf:definition} Contextual bandit +:label: contextual_bandit + +At each timestep $t$, a new *context* +$x_t$ is drawn from some distribution $\nu_{\text{x}}$. The learner gets +to observe the context, and choose an action $a_t$ according to some +context-dependent policy $\pi_t(x_t)$. Then, the learner observes the +reward from the chosen arm $r_t \sim \nu^{a_t}(x_t)$. The reward +distribution also depends on the context. +::: + ++++ + +Assuming our context is *discrete*, we can just perform the same +algorithms, treating each context-arm pair as its own arm. This gives us +an enlarged MAB of $K |\mathcal{X}|$ arms. + +:::{attention} +Write down the UCB algorithm for this enlarged MAB. That is, write an +expression for $\pi_t(x_t) = \arg\max_a \dots$. +::: + +Recall that running UCB for $T$ timesteps on an MAB with $K$ arms +achieves a regret bound of $\tilde{O}(\sqrt{TK})$. So in this problem, +we would achieve regret $\tilde{O}(\sqrt{TK|\mathcal{X}|})$ in the +contextual MAB, which has a polynomial dependence on $|\mathcal{X}|$. +But in a situation where we have large, or even infinitely many +contexts, e.g. in the case where our context is a continuous value, this +becomes intractable. + +Note that this "enlarged MAB" treats the different contexts as entirely +unrelated to each other, while in practice, often contexts are *related* +to each other in some way: for example, we might want to advertise +similar products to users with similar preferences. How can we +incorporate this structure into our solution? + ++++ + +(lin_ucb)= +### Linear contextual bandits + +We want to model the *mean reward* of arm $k$ as a function of the +context, i.e. $\mu^k(x)$. One simple model is the *linear* one: +$\mu^k(x) = x^\top \theta^k$, where $x \in \mathcal{X} = \mathbb{R}^d$ and +$\theta^k \in \mathbb{R}^d$ describes a *feature direction* for arm $k$. 
Recall +that **supervised learning** gives us a way to estimate a conditional +expectation from samples: We learn a *least squares* estimator from the +timesteps where arm $k$ was selected: +$$\hat \theta_t^k = \arg\min_{\theta \in \mathbb{R}^d} \sum_{\{ i \in [t] : a_i = k \}} (r_i - x_i^\top \theta)^2.$$ +This has the closed-form solution known as the *ordinary least squares* +(OLS) estimator: + +:::{math} +:label: ols_bandit + +\begin{aligned} + \hat \theta_t^k & = (A_t^k)^{-1} \sum_{\{ i \in [t] : a_i = k \}} x_i r_i \\ + \text{where} \quad A_t^k & = \sum_{\{ i \in [t] : a_i = k \}} x_i x_i^\top. +\end{aligned} +::: + +We can now apply the UCB algorithm in this environment in order to +balance *exploration* of new arms and *exploitation* of arms that we +believe to have high reward. But how should we construct the upper +confidence bound? Previously, we treated the pulls of an arm as i.i.d. +samples and used Hoeffding's inequality to bound the distance of the +sample mean, our estimator, from the true mean. However, now our +estimator is not a sample mean, but rather the OLS estimator above {eq}`ols_bandit`. Instead, we'll use **Chebyshev's +inequality** to construct an upper confidence bound. + +:::{prf:theorem} Chebyshev's inequality +:label: chebyshev + +For a random variable $Y$ such that +$\E Y = 0$ and $\E Y^2 = \sigma^2$, +$$|Y| \le \beta \sigma \quad \text{with probability} \ge 1 - \frac{1}{\beta^2}$$ +::: + +Since the OLS estimator is known to be unbiased (try proving this +yourself), we can apply Chebyshev's inequality to +$x_t^\top (\hat \theta_t^k - \theta^k)$: + +$$\begin{aligned} + x_t^\top \theta^k \le x_t^\top \hat \theta_t^k + \beta \sqrt{x_t^\top (A_t^k)^{-1} x_t} \quad \text{with probability} \ge 1 - \frac{1}{\beta^2} +\end{aligned}$$ + +:::{attention} +We haven't explained why $x_t^\top (A_t^k)^{-1} x_t$ is the correct +expression for the variance of $x_t^\top \hat \theta_t^k$. This result +follows from some algebra on the definition of the OLS estimator {eq}`ols_bandit`. +::: + +The first term is exactly our predicted reward $\hat \mu^k_t(x_t)$. To +interpret the second term, note that +$$x_t^\top (A_t^k)^{-1} x_t = \frac{1}{N_t^k} x_t^\top (\Sigma_t^k)^{-1} x_t,$$ +where +$$\Sigma_t^k = \frac{1}{N_t^k} \sum_{\{ i \in [t] : a_i = k \}} x_i x_i^\top$$ +is the empirical covariance matrix of the contexts (assuming that the +context has mean zero). That is, the learner is encouraged to choose +arms when $x_t$ is *not aligned* with the data seen so far, or if arm +$k$ has not been explored much and so $N_t^k$ is small. + +We can now substitute these quantities into UCB to get the **LinUCB** +algorithm: + +```{code-cell} +class LinUCBPseudocode(Agent): + def __init__( + self, K: int, T: int, D: int, lam: float, get_c: Callable[[int], float] + ): + super().__init__(K, T) + self.lam = lam + self.get_c = get_c + self.contexts = [None for _ in range(K)] + self.A = np.repeat(lam * np.eye(D)[...], K) + self.targets = np.zeros(K, D) + self.w = np.zeros(K, D) + + def choose_arm(self, context: Float[Array, " D"]): + c = self.get_c(self.count) + scores = self.w @ context + c * np.sqrt( + context.T @ np.linalg.solve(self.A, context) + ) + return random_argmax(scores) + + def update_history(self, context: Float[Array, " D"], arm: int, reward: int): + self.A[arm] += np.outer(context, context) + self.targets[arm] += context * reward + self.w[arm] = np.linalg.solve(self.A[arm], self.targets[arm]) +``` + +:::{attention} +Note that the matrix $A_t^k$ above might not be invertible. 
When does this occur? One way to address this is to include a $\lambda I$ regularization term to ensure that $A_t^k$ is invertible. This is equivalent to solving a *ridge regression* problem instead of the unregularized least squares problem. Implement this solution. TODO SOLUTION CURRENTLY SHOWN +::: + ++++ + +$c_t$ is similar to the $\log (2t/\delta')$ term of UCB: It controls the +width of the confidence interval. Here, we treat it as a tunable +parameter, though in a theoretical analysis, it would depend on $A_t^k$ +and the probability $\delta$ with which the bound holds. + +Using similar tools for UCB, we can also prove an $\tilde{O}(\sqrt{T})$ +regret bound. The full details of the analysis can be found in Section 3 of {cite}`agarwal_reinforcement_2022`. + ## Summary In this chapter, diff --git a/book/bibliography.md b/book/bibliography.md deleted file mode 100644 index d2b9c3c..0000000 --- a/book/bibliography.md +++ /dev/null @@ -1,5 +0,0 @@ -# Bibliography - -```{bibliography} -``` - diff --git a/book/contextual_bandits.md b/book/contextual_bandits.md deleted file mode 100644 index 1afe425..0000000 --- a/book/contextual_bandits.md +++ /dev/null @@ -1,168 +0,0 @@ ---- -jupytext: - text_representation: - extension: .md - format_name: myst - format_version: 0.13 - jupytext_version: 1.16.2 -kernelspec: - display_name: Python 3 (ipykernel) - language: python - name: python3 ---- - -(contextual_bandits)= -# Contextual bandits - -In the above MAB environment, the reward distributions of the arms -remain constant. However, in many real-world settings, we might receive -additional information that affects these distributions. For example, in -the online advertising case where each arm corresponds to an ad we could -show the user, we might receive information about the user's preferences -that changes how likely they are to click on a given ad. We can model -such environments using **contextual bandits**. - -:::{prf:definition} Contextual bandit -:label: contextual_bandit - -At each timestep $t$, a new *context* -$x_t$ is drawn from some distribution $\nu_{\text{x}}$. The learner gets -to observe the context, and choose an action $a_t$ according to some -context-dependent policy $\pi_t(x_t)$. Then, the learner observes the -reward from the chosen arm $r_t \sim \nu^{a_t}(x_t)$. The reward -distribution also depends on the context. -::: - -+++ - -Assuming our context is *discrete*, we can just perform the same -algorithms, treating each context-arm pair as its own arm. This gives us -an enlarged MAB of $K |\mathcal{X}|$ arms. - -:::{attention} -Write down the UCB algorithm for this enlarged MAB. That is, write an -expression for $\pi_t(x_t) = \argmax_a \dots$. -::: - -Recall that running UCB for $T$ timesteps on an MAB with $K$ arms -achieves a regret bound of $\tilde{O}(\sqrt{TK})$. So in this problem, -we would achieve regret $\tilde{O}(\sqrt{TK|\mathcal{X}|})$ in the -contextual MAB, which has a polynomial dependence on $|\mathcal{X}|$. -But in a situation where we have large, or even infinitely many -contexts, e.g. in the case where our context is a continuous value, this -becomes intractable. - -Note that this "enlarged MAB" treats the different contexts as entirely -unrelated to each other, while in practice, often contexts are *related* -to each other in some way: for example, we might want to advertise -similar products to users with similar preferences. How can we -incorporate this structure into our solution? 
- -+++ - -(lin_ucb)= -## Linear contextual bandits - -We want to model the *mean reward* of arm $k$ as a function of the -context, i.e. $\mu^k(x)$. One simple model is the *linear* one: -$\mu^k(x) = x^\top \theta^k$, where $x \in \mathcal{X} = \mathbb{R}^d$ and -$\theta^k \in \mathbb{R}^d$ describes a *feature direction* for arm $k$. Recall -that **supervised learning** gives us a way to estimate a conditional -expectation from samples: We learn a *least squares* estimator from the -timesteps where arm $k$ was selected: -$$\hat \theta_t^k = \argmin_{\theta \in \mathbb{R}^d} \sum_{\{ i \in [t] : a_i = k \}} (r_i - x_i^\top \theta)^2.$$ -This has the closed-form solution known as the *ordinary least squares* -(OLS) estimator: - -:::{math} -:label: ols_bandit - -\begin{aligned} - \hat \theta_t^k & = (A_t^k)^{-1} \sum_{\{ i \in [t] : a_i = k \}} x_i r_i \\ - \text{where} \quad A_t^k & = \sum_{\{ i \in [t] : a_i = k \}} x_i x_i^\top. -\end{aligned} -::: - -We can now apply the UCB algorithm in this environment in order to -balance *exploration* of new arms and *exploitation* of arms that we -believe to have high reward. But how should we construct the upper -confidence bound? Previously, we treated the pulls of an arm as i.i.d. -samples and used Hoeffding's inequality to bound the distance of the -sample mean, our estimator, from the true mean. However, now our -estimator is not a sample mean, but rather the OLS estimator above {eq}`ols_bandit`. Instead, we'll use **Chebyshev's -inequality** to construct an upper confidence bound. - -:::{prf:theorem} Chebyshev's inequality -:label: chebyshev - -For a random variable $Y$ such that -$\E Y = 0$ and $\E Y^2 = \sigma^2$, -$$|Y| \le \beta \sigma \quad \text{with probability} \ge 1 - \frac{1}{\beta^2}$$ -::: - -Since the OLS estimator is known to be unbiased (try proving this -yourself), we can apply Chebyshev's inequality to -$x_t^\top (\hat \theta_t^k - \theta^k)$: - -$$\begin{aligned} - x_t^\top \theta^k \le x_t^\top \hat \theta_t^k + \beta \sqrt{x_t^\top (A_t^k)^{-1} x_t} \quad \text{with probability} \ge 1 - \frac{1}{\beta^2} -\end{aligned}$$ - -:::{attention} -We haven't explained why $x_t^\top (A_t^k)^{-1} x_t$ is the correct -expression for the variance of $x_t^\top \hat \theta_t^k$. This result -follows from some algebra on the definition of the OLS estimator {eq}`ols_bandit`. -::: - -The first term is exactly our predicted reward $\hat \mu^k_t(x_t)$. To -interpret the second term, note that -$$x_t^\top (A_t^k)^{-1} x_t = \frac{1}{N_t^k} x_t^\top (\Sigma_t^k)^{-1} x_t,$$ -where -$$\Sigma_t^k = \frac{1}{N_t^k} \sum_{\{ i \in [t] : a_i = k \}} x_i x_i^\top$$ -is the empirical covariance matrix of the contexts (assuming that the -context has mean zero). That is, the learner is encouraged to choose -arms when $x_t$ is *not aligned* with the data seen so far, or if arm -$k$ has not been explored much and so $N_t^k$ is small. 
- -We can now substitute these quantities into UCB to get the **LinUCB** -algorithm: - -```{code-cell} -class LinUCBPseudocode(Agent): - def __init__( - self, K: int, T: int, D: int, lam: float, get_c: Callable[[int], float] - ): - super().__init__(K, T) - self.lam = lam - self.get_c = get_c - self.contexts = [None for _ in range(K)] - self.A = np.repeat(lam * np.eye(D)[...], K) - self.targets = np.zeros(K, D) - self.w = np.zeros(K, D) - - def choose_arm(self, context: Float[Array, " D"]): - c = self.get_c(self.count) - scores = self.w @ context + c * np.sqrt( - context.T @ np.linalg.solve(self.A, context) - ) - return random_argmax(scores) - - def update_history(self, context: Float[Array, " D"], arm: int, reward: int): - self.A[arm] += np.outer(context, context) - self.targets[arm] += context * reward - self.w[arm] = np.linalg.solve(self.A[arm], self.targets[arm]) -``` - -:::{attention} -Note that the matrix $A_t^k$ above might not be invertible. When does this occur? One way to address this is to include a $\lambda I$ regularization term to ensure that $A_t^k$ is invertible. This is equivalent to solving a *ridge regression* problem instead of the unregularized least squares problem. Implement this solution. TODO SOLUTION CURRENTLY SHOWN -::: - -+++ - -$c_t$ is similar to the $\log (2t/\delta')$ term of UCB: It controls the -width of the confidence interval. Here, we treat it as a tunable -parameter, though in a theoretical analysis, it would depend on $A_t^k$ -and the probability $\delta$ with which the bound holds. - -Using similar tools for UCB, we can also prove an $\tilde{O}(\sqrt{T})$ -regret bound. The full details of the analysis can be found in Section 3 of {cite}`agarwal_reinforcement_2022`. diff --git a/book/control.md b/book/control.md index b5b62ea..1192a19 100644 --- a/book/control.md +++ b/book/control.md @@ -9,11 +9,15 @@ kernelspec: display_name: Python 3 (ipykernel) language: python name: python3 +math: + '\st': 'x' + '\act': 'u' --- -(lqr_chapter)= # Linear Quadratic Regulators +## Introduction + Up to this point, we have considered decision problems with finitely many states and actions. However, in many applications, states and actions may take on continuous values. For example, consider autonomous @@ -53,7 +57,7 @@ laws of physics, are highly complex. This task is equivalent to the classic control problem known as *CartPole*: :::{image} shared/cart_pole.png -:width: "40%" +:width: 200px ::: The state $\st \in \mathbb{R}^4$ can be described by: @@ -136,9 +140,8 @@ $$ ### A first attempt: Discretization Can we solve this problem using tools from the finite MDP setting? If -$\mathcal{S}$ and $\mathcal{A}$ were finite, then we'd be able to work backwards using the -DP algorithm for computing the optimal policy in an MDP -{prf:ref}`pi_star_dp`. This inspires us to try *discretizing* the +$\mathcal{S}$ and $\mathcal{A}$ were finite, then we'd be able to work backwards using the DP algorithm for computing the optimal policy in an MDP ([](#pi_star_dp)). +This inspires us to try *discretizing* the problem. Suppose $\mathcal{S}$ and $\mathcal{A}$ are bounded, that is, @@ -280,7 +283,7 @@ $$ In this section, we'll compute the optimal value function $V^\star_h$, Q-function $Q^\star_h$, and policy $\pi^\star_h$ in the LQR setting {prf:ref}`lqr_definition` using -**dynamic programming** in a very similar way to the DP algorithms [in the MDP setting](eval_dp): +**dynamic programming** in a very similar way to the DP algorithms [in the MDP setting](#eval_dp): 1. 
We'll compute $V_H^\star$ (at the end of the horizon) as our base case. @@ -953,10 +956,11 @@ definite. Note that Hessian matrices are generally symmetric, so we can apply this process to $Q$ and $R$ to obtain the positive definite approximations -$\widetilde{Q}$ and $\widetilde{R}$. Now that we have a upward-curved +$\widetilde{Q}$ and $\widetilde{R}$. +Now that we have a upward-curved quadratic approximation to the cost function, and a linear approximation to the state transitions, we can simply apply the time-homogenous LQR -methods from [](optimal_lqr). +methods from [](#optimal_lqr). But what happens when we enter states far away from $\st^\star$ or want to use actions far from $\act^\star$? A Taylor approximation is only diff --git a/book/exploration.md b/book/exploration.md index 24e4736..5c3b678 100644 --- a/book/exploration.md +++ b/book/exploration.md @@ -11,14 +11,13 @@ kernelspec: name: python3 --- -(exploration)= # Exploration in MDPs ## Introduction -One of the key challenges of reinforcement learning is the *exploration-exploitation tradeoff*. Should we *exploit* actions we know will give high reward, or should we *explore* different actions to discover potentially better strategies? An algorithm that doesn't explore effectively might easily *overfit* to certain areas of the state space, and fail to generalize once they enter a region they haven't yet seen. The algorithms we saw in the chapter on fitted DP {ref}`fitted_dp` suffer from this issue. +One of the key challenges of reinforcement learning is the *exploration-exploitation tradeoff*. Should we *exploit* actions we know will give high reward, or should we *explore* different actions to discover potentially better strategies? An algorithm that doesn't explore effectively might easily *overfit* to certain areas of the state space, and fail to generalize once they enter a region they haven't yet seen. The algorithms we saw in the chapter on fitted DP [](./fitted_dp.md) suffer from this issue. -In the multi-armed bandits chapter {ref}`bandits`, where the state never changes so all we care about are the actions, we saw algorithms like UCB {ref}`ucb` and Thompson sampling {ref}`thompson_sampling` that incentivize the learner to explore arms that it is uncertain about. In this chapter, we will see how to generalize these ideas to the MDP setting. +In [](./bandits.md), where the state never changes so all we care about are the actions, we saw algorithms like [](#ucb) and [Thompson sampling](#thompson_sampling) that incentivize the learner to explore arms that it is uncertain about. In this chapter, we will see how to generalize these ideas to the MDP setting. :::{prf:definition} Per-episode regret :label: per_episode_regret @@ -48,7 +47,7 @@ There are $|\mathcal{S}|$ states. The agent starts in the leftmost state. In eve ### Exploration in deterministic MDPs -Let us address the exploration problem in a *deterministic* MDP where taking action $a$ in state $s$ always leads to the state $P(s, a) \in \mathcal{S}$. In this simple setting, there will be no "automatic" exploration due to randomness, so our strategy must actively explore new states. One simple strategy is to visit every possible state-action pair to learn the entire MDP. Then, once the MDP is known, we can use DP to solve for the optimal policy. (This should remind you of the {ref}`etc` algorithm.) +Let us address the exploration problem in a *deterministic* MDP where taking action $a$ in state $s$ always leads to the state $P(s, a) \in \mathcal{S}$. 
In this simple setting, there will be no "automatic" exploration due to randomness, so our strategy must actively explore new states. One simple strategy is to visit every possible state-action pair to learn the entire MDP. Then, once the MDP is known, we can use DP to solve for the optimal policy. (This should remind you of the [](#etc) algorithm.) ::::{prf:definition} Explore-then-exploit (for deterministic MDPs) :label: explore_then_exploit @@ -62,20 +61,27 @@ $K \gets \emptyset$ Using our known transitions $K$, compute the shortest path $ The shortest path computation can be implemented using DP. We leave this as an exercise. :::: -```{code-cell} -def explore_then_exploit(mdp: MDP): -``` +:::{prf:theorem} Performance of explore-then-exploit +:label: explore_then_exploit_performance -:::{prf:theorem} -Performance of explore-then-exploitexplore_then_exploit_performance As long as every state can be reached from $s_0$ within a single episode, i.e. $|\mathcal{S}| \le \hor$, this will eventually be able to explore all $|\mathcal{S}| |\mathcal{A}|$ state-action pairs, adding one new transition per episode. We know it will take at most $|\mathcal{S}| |\mathcal{A}|$ iterations to explore the entire MDP, after which $\pi^t = \pi^\star$, incurring no additional regret. For each $\pi^t$ up until then, corresponding to the shortest-path policies $\tilde \pi$, the value of policy $\pi^t$ will differ from that of $\pi^\star$ by at most $\hor$, since the policies will differ by at most $1$ reward at each timestep. So, $$\sum_{t=0}^{T-1} V^\star_0 - V_0^{\pi^t} \le |\mathcal{S}||\mathcal{A}| \hor.$$ (Note that this MDP and algorithm are deterministic, so the regret is not random.) +As long as every state can be reached from $s_0$ within a single episode, i.e. $|\mathcal{S}| \le \hor$, this will eventually be able to explore all $|\mathcal{S}| |\mathcal{A}|$ state-action pairs, adding one new transition per episode. We know it will take at most $|\mathcal{S}| |\mathcal{A}|$ iterations to explore the entire MDP, after which $\pi^t = \pi^\star$, incurring no additional regret. +For each $\pi^t$ up until then, corresponding to the shortest-path policies $\tilde \pi$, the value of policy $\pi^t$ will differ from that of $\pi^\star$ by at most $\hor$, since the policies will differ by at most $1$ reward at each timestep. So, + +$$\sum_{t=0}^{T-1} V^\star_0 - V_0^{\pi^t} \le |\mathcal{S}||\mathcal{A}| \hor.$$ + +(Note that this MDP and algorithm are deterministic, so the regret is not random.) ::: (mdp_mab)= ## Treating an unknown MDP as a MAB -We also explored the exploration-exploitation tradeoff in the chapter on {ref}`bandits`. Recall tthat in the MAB setting, we have $K$ arms, each of which has an unknown reward distribution, and we want to learn which of the arms is *optimal*, i.e. has the highest mean reward. +We also explored the exploration-exploitation tradeoff in [](./bandits.md). Recall tthat in the MAB setting, we have $K$ arms, each of which has an unknown reward distribution, and we want to learn which of the arms is *optimal*, i.e. has the highest mean reward. + +One algorithm that struck a good balance between exploration and exploitation was the **upper confidence bound** algorithm [](#ucb): For each arm, we construct a *confidence interval* for its true mean award, and then choose the arm with the highest upper confidence bound. 
In summary, + +$$k_{t+1} \gets \arg\max_{k \in [K]} \frac{R^{k}_t}{N^{k}_t} + \sqrt{\frac{\ln(2t/\delta)}{2 N^{k}_t}}$$ -One algorithm that struck a good balance between exploration and exploitation was the **upper confidence bound** algorithm {ref}`ucb`: For each arm, we construct a *confidence interval* for its true mean award, and then choose the arm with the highest upper confidence bound. In summary, $$k_{t+1} \gets \arg\max_{k \in [K]} \frac{R^{k}_t}{N^{k}_t} + \sqrt{\frac{\ln(2t/\delta)}{2 N^{k}_t}}$$ where $N_t^k$ indicates the number of times arm $k$ has been pulled up until time $t$, $R_t^k$ indicates the total reward obtained by pulling arm $k$ up until time $t$, and $\delta > 0$ controls the width of the confidence interval. How might we extend UCB to the MDP case? +where $N_t^k$ indicates the number of times arm $k$ has been pulled up until time $t$, $R_t^k$ indicates the total reward obtained by pulling arm $k$ up until time $t$, and $\delta > 0$ controls the width of the confidence interval. How might we extend UCB to the MDP case? Let us formally describe an unknown MDP as an MAB problem. In an unknown MDP, we want to learn which *policy* is optimal. So if we want to apply MAB techniques to solving an MDP, it makes sense to think of *arms* as *policies*. There are $K = (|\mathcal{A}|^{|\mathcal{S}|})^\hor$ deterministic policies in a finite MDP. Then, "pulling" arm $\pi$ corresponds to using $\pi$ to act through a trajectory in the MDP, and observing the total reward. @@ -115,18 +121,26 @@ At a high level, the UCB-VI algorithm can be described as follows: 2. **Reward bonus:** Design a reward bonus $b_\hi(s, a) \in \mathbb{R}$ to encourage exploration, analogous to the UCB term. -3. **Optimistic planning:** Use DP to compute the optimal policy $\hat \pi_\hi(s)$ in the modelled MDP $$\tilde{\mathcal{M}} = (\mathcal{S}, \mathcal{A}, \{ \hat{P}_\hi \}_{h \in [H]}, \{ r_\hi + b_\hi \}_{h \in [H]}, H).$$ +3. **Optimistic planning:** Use DP to compute the optimal policy $\hat \pi_\hi(s)$ in the modelled MDP + +$$\tilde{\mathcal{M}} = (\mathcal{S}, \mathcal{A}, \{ \hat{P}_\hi \}_{h \in [H]}, \{ r_\hi + b_\hi \}_{h \in [H]}, H).$$ 4. **Execution:** Use $\hat \pi_\hi(s)$ to collect a new trajectory, and repeat. -We detail each of these steps below. The full definition follows in {prf:ref}`ucb_vi`. +We detail each of these steps below. The full definition follows in [](#ucb-vi-alg). ### Modelling the transitions -We seek to approximate $P_\hi(s_{h+1} \mid s_\hi, a_\hi) = \frac{\P(s_\hi, a_\hi, s_{h+1})}{\P(s_\hi, a_\hi)}$. We can estimate these using their sample probabilities from the dataset. That is, define $$\begin{aligned} +We seek to approximate $P_\hi(s_{h+1} \mid s_\hi, a_\hi) = \frac{\pr(s_\hi, a_\hi, s_{h+1})}{\pr(s_\hi, a_\hi)}$. We can estimate these using their sample probabilities from the dataset. That is, define + +$$\begin{aligned} N_\hi^t(s, a, s') & := \sum_{i=0}^{t-1} \ind{ (s_\hi^i, a_\hi^i, s_{h+1}^i) = (s, a, s') } \\ N_\hi^t(s, a) & := \sum_{i=0}^{t-1} \ind{ (s_\hi^i, a_\hi^i) = (s, a) } \\ -\end{aligned}$$ Then we can model $$\hat{P}_\hi^t(s' \mid s, a) = \frac{N_\hi^t(s, a, s')}{N_\hi^t(s, a)}.$$ +\end{aligned}$$ + +Then we can model + +$$\hat{P}_\hi^t(s' \mid s, a) = \frac{N_\hi^t(s, a, s')}{N_\hi^t(s, a)}.$$ :::{prf:remark} Note that this is also a fairly naive, nonparametric estimator that doesn't assume any underlying structure of the MDP. We'll see how to incorporate assumptions about the MDP in the following section. 
@@ -142,13 +156,29 @@ To motivate the reward bonus term $b_\hi^t(s, a)$, recall how we designed the re 3. To make this bound *uniform* across all timesteps $t \in [T]$, we applied the union bound and multiplied $\delta$ by a factor of $T$. -We'd like to do the same for UCB-VI, and construct the bonus term such that $V^\star_\hi(s) \le \hat{V}_\hi^t(s)$ with high probability. However, our construction will be more complex than the MAB case, since $\hat{V}_\hi^t(s)$ depends on the bonus $b_\hi^t(s, a)$ implicitly via DP. We claim that the bonus term that gives the proper bound is $$b_\hi^t(s, a) = 2 H \sqrt{\frac{\log( |\mathcal{S}||\mathcal{A}|H T/\delta )}{N_\hi^t(s, a)}}. - \label{eq:ucb_vi_bonus}$$ We will only provide a heuristic sketch of the proof; see `\cite[Section 7.3]{agarwal_reinforcement_2022}`{=latex} for a full proof. +We'd like to do the same for UCB-VI, and construct the bonus term such that $V^\star_\hi(s) \le \hat{V}_\hi^t(s)$ with high probability. However, our construction will be more complex than the MAB case, since $\hat{V}_\hi^t(s)$ depends on the bonus $b_\hi^t(s, a)$ implicitly via DP. We claim that the bonus term that gives the proper bound is -:::{prf:remark} UCB-VI reward bonus construction +$$b_\hi^t(s, a) = 2 H \sqrt{\frac{\log( |\mathcal{S}||\mathcal{A}|H T/\delta )}{N_\hi^t(s, a)}}. +\label{eq:ucb_vi_bonus}$$ + +We will only provide a heuristic sketch of the proof; see {cite}`agarwal_reinforcement_2022` (Section 7.3) for a full proof. + +::::{prf:remark} UCB-VI reward bonus construction :label: ucb_vi_bonus -We aim to show that, with high probability, $$V_\hi^\star(s) \le \hat{V}_\hi^t(s) \quad \forall t \in [T], h \in [H], s \in \mathcal{S}.$$ We'll do this by bounding the error incurred at each step of DP. Recall that DP solves for $\hat{V}_\hi^t(s)$ recursively as follows: $$\hat{V}_\hi^t(s) = \max_{a \in \mathcal{A}} \left[ \tilde r^t_\hi(s, a) + \E_{s' \sim \hat{P}_\hi^t(\cdot \mid s, a)} \left[ \hat{V}_{h+1}^t(s') \right] \right]$$ where $\tilde r^t_\hi(s, a) = r_\hi(s, a) + b_\hi^t(s, a)$ is the reward function of our modelled MDP $\tilde{\mathcal{M}}^t$. On the other hand, we know that $V^\star$ must satisfy $$V^\star_\hi(s) = \max_{a \in \mathcal{A}} \left[ \tilde r^t_\hi(s, a) + \E_{s' \sim P^?_\hi(\cdot \mid s, a)} [V^\star_{\hi+1}(s')] \right]$$ so it suffices to bound the difference between the two inner expectations. There are two sources of error: +We aim to show that, with high probability, + +$$V_\hi^\star(s) \le \hat{V}_\hi^t(s) \quad \forall t \in [T], h \in [H], s \in \mathcal{S}.$$ + +We'll do this by bounding the error incurred at each step of DP. Recall that DP solves for $\hat{V}_\hi^t(s)$ recursively as follows: + +$$\hat{V}_\hi^t(s) = \max_{a \in \mathcal{A}} \left[ \tilde r^t_\hi(s, a) + \E_{s' \sim \hat{P}_\hi^t(\cdot \mid s, a)} \left[ \hat{V}_{h+1}^t(s') \right] \right]$$ + +where $\tilde r^t_\hi(s, a) = r_\hi(s, a) + b_\hi^t(s, a)$ is the reward function of our modelled MDP $\tilde{\mathcal{M}}^t$. On the other hand, we know that $V^\star$ must satisfy + +$$V^\star_\hi(s) = \max_{a \in \mathcal{A}} \left[ \tilde r^t_\hi(s, a) + \E_{s' \sim P^?_\hi(\cdot \mid s, a)} [V^\star_{\hi+1}(s')] \right]$$ + +so it suffices to bound the difference between the two inner expectations. There are two sources of error: 1. The value functions $\hat{V}^t_{h+1}$ v.s. 
$V^\star_{h+1}$ @@ -162,39 +192,58 @@ We can bound these individually, and then combine them by the triangle inequalit \text{error} = \left| \E_{s' \sim \hat{P}_\hi^t(\cdot \mid s, a)} \left[ V^\star_{h+1}(s') \right] - \E_{s' \sim P^?_\hi(\cdot \mid s, a)} \left[ V^\star_{h+1}(s') \right]. \right| ::: -Let us bound this term for a fixed $s, a, h, t$. (Later we can make this uniform across $s, a, h, t$ using the union bound.) Note that expanding out the definition of $\hat{P}_\hi^t$ gives $$\begin{aligned} +Let us bound this term for a fixed $s, a, h, t$. (Later we can make this uniform across $s, a, h, t$ using the union bound.) Note that expanding out the definition of $\hat{P}_\hi^t$ gives + +$$\begin{aligned} \E_{s' \sim \hat{P}_\hi^t(\cdot \mid s, a)} \left[ V^\star_{h+1}(s') \right] & = \sum_{s' \in \mathcal{S}} \frac{N^t_\hi(s, a, s')}{N^t_\hi(s, a)} V^\star_{h+1}(s') \\ & = \frac{1}{N^t_\hi(s, a)} \sum_{i=0}^{t-1} \sum_{s' \in \mathcal{S}} \ind{ (s_\hi^i, a_\hi^i, s_{h+1}^i) = (s, a, s') } V^\star_{h+1}(s') \\ & = \frac{1}{N^t_\hi(s, a)} \sum_{i=0}^{t-1} \underbrace{\ind{ (s_\hi^i, a_\hi^i) = (s, a) } V^\star_{h+1}(s_{h+1}^i)}_{X^i} - -\end{aligned}$$ since the terms where $s' \neq s_{h+1}^i$ vanish. +\end{aligned}$$ + +since the terms where $s' \neq s_{h+1}^i$ vanish. + +Now, in order to apply Hoeffding's inequality, we would like to express the second term in {eq}`err` as a sum over $t$ random variables as well. We will do this by redundantly averaging over all desired trajectories (i.e. where we visit state $s$ and action $a$ at time $h$): -Now, in order to apply Hoeffding's inequality, we would like to express the second term in {eq}`err` as a sum over $t$ random variables as well. We will do this by redundantly averaging over all desired trajectories (i.e. where we visit state $s$ and action $a$ at time $h$): $$\begin{aligned} +$$\begin{aligned} \E_{s' \sim P^?_\hi(\cdot \mid s, a)} \left[ V^\star_{h+1}(s') \right] & = \sum_{s' \in \mathcal{S}} P^?_\hi(s' \mid s, a) V^\star_{h+1}(s') \\ & = \sum_{s' \in \mathcal{S}} \frac{1}{N^t_\hi(s, a)} \sum_{i=0}^{t-1} \ind{ (s_\hi^i, a_\hi^i) = (s, a) } P^?_\hi(s' \mid s, a) V^\star_{h+1}(s') \\ & = \frac{1}{N^t_\hi(s, a)} \sum_{i=0}^{t-1} \E_{s_{h+1}^i \sim P^?_{h}(\cdot \mid s_\hi^i, a_\hi^i)} X^i. - -\end{aligned}$$ Now we can apply Hoeffding's inequality to $X^i - \E_{s_{h+1}^i \sim P^?_{h}(\cdot \mid s_\hi^i, a_\hi^i)} X^i$, which is bounded by $\hor$, to obtain that, with probability at least $1-\delta$, $$\text{error} = \left| \frac{1}{N^t_\hi(s, a)} \sum_{i=0}^{t-1} \left(X^i - \E_{s_{h+1}^i \sim P^?_{h}(\cdot \mid s_\hi^i, a_\hi^i)} X^i \right) \right| \le 2 H \sqrt{\frac{\ln(1/\delta)}{N_\hi^t(s, a)}}.$$ Applying a union bound over all $s \in \mathcal{S}, a \in \mathcal{A}, t \in [T], h \in [H]$ gives the $b_\hi^t(s, a)$ term above. -::: +\end{aligned} +$$ + +Now we can apply Hoeffding's inequality to $X^i - \E_{s_{h+1}^i \sim P^?_{h}(\cdot \mid s_\hi^i, a_\hi^i)} X^i$, which is bounded by $\hor$, to obtain that, with probability at least $1-\delta$, + +$$ +\text{error} = \left| \frac{1}{N^t_\hi(s, a)} \sum_{i=0}^{t-1} \left(X^i - \E_{s_{h+1}^i \sim P^?_{h}(\cdot \mid s_\hi^i, a_\hi^i)} X^i \right) \right| \le 2 H \sqrt{\frac{\ln(1/\delta)}{N_\hi^t(s, a)}}. +$$ + +Applying a union bound over all $s \in \mathcal{S}, a \in \mathcal{A}, t \in [T], h \in [H]$ gives the $b_\hi^t(s, a)$ term above. 
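For a rough sense of scale (with illustrative numbers, not taken from the text): if $|\mathcal{S}| = 10$, $|\mathcal{A}| = 4$, $H = 10$, $T = 10^4$, and $\delta = 0.01$, then $\log(|\mathcal{S}||\mathcal{A}|HT/\delta) \approx 20$, so a pair $(s, a)$ visited $N_\hi^t(s, a) = 100$ times receives a bonus of about $2 \cdot 10 \cdot \sqrt{20/100} \approx 9$, on the order of the largest possible value $H = 10$ itself; only after thousands of visits (e.g. $N_\hi^t(s, a) = 10^4$ gives a bonus of about $0.9$) does the bonus shrink to a small fraction of $H$. This is why the bonuses keep the learner optimistic about rarely visited state-action pairs.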
+:::: ### Definition Putting these parts together, we can define the algorithm as follows: -::::{prf:definition} UCB-VI -:label: ucb_vi +:::{math} +:label: ucb-vi-alg +3 + 1 = 4 +::: + -:::: ### Performance of UCB-VI How exactly does UCB-VI strike a good balance between exploration and exploitation? In UCB for MABs, the bonus exploration term is simple to interpret: It encourages the learner to take actions with a high exploration term. Here, the policy depends on the bonus term indirectly: The policy is obtained by planning in an MDP where the bonus term is added to the reward function. Note that the bonuses *propagate backwards* in DP, effectively enabling the learner to *plan to explore* unknown states. This effect takes some further interpretation. -Recall we constructed $b^t_\hi$ so that, with high probability, $V^\star_\hi(s) \le \hat{V}_\hi^t(s)$ and so $$V^\star_\hi(s) - V^{\pi^t}_\hi(s) \le \hat{V}_\hi^t(s) - V^{\pi^t}_\hi(s).$$ That is, the l.h.s. measures how suboptimal policy $\pi^t$ is in the true environment, while the r.h.s. is the difference in the policy's value when acting in the modelled MDP $\tilde{\mathcal{M}}^t$ instead of the true one $\mathcal{M}^{?}$. +Recall we constructed $b^t_\hi$ so that, with high probability, $V^\star_\hi(s) \le \hat{V}_\hi^t(s)$ and so + +$$V^\star_\hi(s) - V^{\pi^t}_\hi(s) \le \hat{V}_\hi^t(s) - V^{\pi^t}_\hi(s).$$ + +That is, the l.h.s. measures how suboptimal policy $\pi^t$ is in the true environment, while the r.h.s. is the difference in the policy's value when acting in the modelled MDP $\tilde{\mathcal{M}}^t$ instead of the true one $\mathcal{M}^{?}$. If the r.h.s. is *small*, this implies that the l.h.s. difference is also small, i.e. that $\pi^t$ is *exploiting* actions that are giving high reward. @@ -208,7 +257,11 @@ It turns out that UCB-VI achieves a per-episode regret of $$\E \left[ \sum_{t=0}^{T-1} \left(V^\star_0(s_0) - V^{\pi^t}_0(s_0) \right) \right] = \tilde{O}(H^2 \sqrt{|\mathcal{S}| |\mathcal{A}| T})$$ ::: -Comparing this to the UCB regret bound $\tilde{O}(\sqrt{T K})$, where $K$ is the number of arms of the MAB, we see that we've reduced the number of effective arms from $|\mathcal{A}|^{|\mathcal{S}|\hor}$ (in {eq}`mdp_as_mab`) to $H^4 |\mathcal{S}||\mathcal{A}|$, which is indeed polynomial in $|\mathcal{S}|$, $|\mathcal{A}|$, and $H$, as desired. This is also roughly the number of episodes it takes to achieve constant-order average regret: $$\frac{1}{T} \E[\text{Regret}_T] = \tilde{O}\left(\sqrt{\frac{H^4 |\mathcal{S}||\mathcal{A}|}{T}}\right)$$ Note that the time-dependent transition matrix has $H |\mathcal{S}|^2 |\mathcal{A}|$ entries. Assuming $H \ll |\mathcal{S}|$, this shows that it's possible to achieve low regret, and achieve a near-optimal policy, while only understanding a $1/|\mathcal{S}|$ fraction of the world's dynamics. +Comparing this to the UCB regret bound $\tilde{O}(\sqrt{T K})$, where $K$ is the number of arms of the MAB, we see that we've reduced the number of effective arms from $|\mathcal{A}|^{|\mathcal{S}|\hor}$ (in {eq}`mdp_as_mab`) to $H^4 |\mathcal{S}||\mathcal{A}|$, which is indeed polynomial in $|\mathcal{S}|$, $|\mathcal{A}|$, and $H$, as desired. This is also roughly the number of episodes it takes to achieve constant-order average regret: + +$$\frac{1}{T} \E[\text{Regret}_T] = \tilde{O}\left(\sqrt{\frac{H^4 |\mathcal{S}||\mathcal{A}|}{T}}\right)$$ + +Note that the time-dependent transition matrix has $H |\mathcal{S}|^2 |\mathcal{A}|$ entries. 
Assuming $H \ll |\mathcal{S}|$, this shows that it's possible to achieve low regret, and achieve a near-optimal policy, while only understanding a $1/|\mathcal{S}|$ fraction of the world's dynamics. ## Linear MDPs @@ -226,17 +279,23 @@ $$\begin{aligned} r_\hi(s, a) & = \phi(s, a)^\top \theta_\hi^\star \end{aligned}$$ -Note that we can also think of $P_\hi(\cdot \mid s, a) = \mu_\hi^\star$ as an $|\mathcal{S}| \times d$ matrix, and think of $\mu^\star_\hi(s')$ as indexing into the $s'$-th row of this matrix (treating it as a column vector). Thinking of $V^\star_{\hi+1}$ as an $|\mathcal{S}|$-dimensional vector, this allows us to write $$\E_{s' \sim P_\hi(\cdot \mid s, a)}[V^\star_{\hi+1}(s)] = (\mu^\star_\hi \phi(s, a))^\top V^\star_{\hi+1}.$$ The $\phi$ feature mapping can be designed to capture interactions between the state $s$ and action $a$. In this book, we'll assume that the feature map $\phi : \mathcal{S} \times \mathcal{A} \to \mathbb{R}^d$ and the reward function (described by $\theta_\hi^\star$) are known to the learner. +Note that we can also think of $P_\hi(\cdot \mid s, a) = \mu_\hi^\star$ as an $|\mathcal{S}| \times d$ matrix, and think of $\mu^\star_\hi(s')$ as indexing into the $s'$-th row of this matrix (treating it as a column vector). Thinking of $V^\star_{\hi+1}$ as an $|\mathcal{S}|$-dimensional vector, this allows us to write + +$$\E_{s' \sim P_\hi(\cdot \mid s, a)}[V^\star_{\hi+1}(s)] = (\mu^\star_\hi \phi(s, a))^\top V^\star_{\hi+1}.$$ + +The $\phi$ feature mapping can be designed to capture interactions between the state $s$ and action $a$. In this book, we'll assume that the feature map $\phi : \mathcal{S} \times \mathcal{A} \to \mathbb{R}^d$ and the reward function (described by $\theta_\hi^\star$) are known to the learner. ::: ### Planning in a linear MDP -It turns out that $Q^\star_\hi$ is also linear with respect to this feature mapping. We can prove this by simply computing it using DP. We initialize $V_{H}^\star(s) = 0 \forall s$. Then we iterate: $$\begin{aligned} +It turns out that $Q^\star_\hi$ is also linear with respect to this feature mapping. We can prove this by simply computing it using DP. We initialize $V_{H}^\star(s) = 0 \forall s$. Then we iterate: + +$$\begin{aligned} Q^\star_\hi(s, a) & = r_\hi(s, a) + \E_{s' \sim P_\hi(\cdot \mid s, a)} [V^\star_{h+1}(s')] \\ & = \phi(s, a)^\top \theta_\hi^\star + (\mu_\hi^\star \phi(s, a))^\top V^\star_{h+1} \\ & = \phi(s, a)^\top \underbrace{( \theta_\hi^\star + (\mu_\hi^\star)^\top V^\star_{h+1})}_{w_\hi} \\ V^\star_\hi(s) & = \max_a Q^\star_\hi(s, a) \\ - \pi^\star_\hi(s) & = \argmax_a Q^\star_\hi(s, a) + \pi^\star_\hi(s) & = \arg\max_a Q^\star_\hi(s, a) \end{aligned}$$ :::{attention} @@ -248,23 +307,41 @@ Show that $Q^\pi_\hi$ is also linear with respect to $\phi(s, a)$ for any policy #### Modelling the transitions -This linear assumption on the MDP will also allow us to model the unknown dynamics $P^?_\hi(s' \mid s, a)$ with techniques from **supervised learning** (SL). Recall that SL is useful for estimating conditional expectations by minimizing mean squared error. We can rephrase the estimation of $P^?_\hi(s' \mid s, a)$ as a least-squares problem as follows: Write $\delta_s$ to denote a one-hot vector in $\mathbb{R}^{|\mathcal{S}|}$, with a $1$ in the $s$-th entry and $0$ everywhere else. 
Note that $$\E_{s' \sim P_h(\cdot \mid s, a)} [\delta_{s'}] = P_h(\cdot \mid s, a) = \mu_h^\star \phi(s, a).$$ Furthermore, since the expectation here is linear with respect to $\phi(s, a)$, we can directly apply least-squares multi-target linear regression to construct the estimate $$\hat \mu = \argmin_{\mu \in \mathbb{R}^{|\mathcal{S}| \times d}} \sum_{t=0}^{T-1} \|\mu \phi(s_h^i, a_h^i) - \delta_{s_{h+1}^i} \|_2^2.$$ This has a well-known closed-form solution: $$\begin{aligned} +This linear assumption on the MDP will also allow us to model the unknown dynamics $P^?_\hi(s' \mid s, a)$ with techniques from **supervised learning** (SL). Recall that SL is useful for estimating conditional expectations by minimizing mean squared error. We can rephrase the estimation of $P^?_\hi(s' \mid s, a)$ as a least-squares problem as follows: Write $\delta_s$ to denote a one-hot vector in $\mathbb{R}^{|\mathcal{S}|}$, with a $1$ in the $s$-th entry and $0$ everywhere else. Note that + +$$\E_{s' \sim P_h(\cdot \mid s, a)} [\delta_{s'}] = P_h(\cdot \mid s, a) = \mu_h^\star \phi(s, a).$$ + +Furthermore, since the expectation here is linear with respect to $\phi(s, a)$, we can directly apply least-squares multi-target linear regression to construct the estimate + +$$\hat \mu = \arg\min_{\mu \in \mathbb{R}^{|\mathcal{S}| \times d}} \sum_{t=0}^{T-1} \|\mu \phi(s_h^i, a_h^i) - \delta_{s_{h+1}^i} \|_2^2.$$ + +This has a well-known closed-form solution: + +$$\begin{aligned} \hat \mu^\top & = (A_h^t)^{-1} \sum_{i=0}^{t-1} \phi(s_h^i, a_h^i) \delta_{s_{h+1}^i}^\top \\ \text{where} \quad A_h^t & = \sum_{i=0}^{t-1} \phi(s_h^i, a_h^i) \phi(s_h^i, a_h^i)^\top + \lambda I -\end{aligned}$$ where we include a $\lambda I$ term to ensure that the matrix $A^t_h$ is invertible. (This can also be derived by adding a $\lambda \|\mu\|_{\text{F}}^2$ regularization term to the objective.) We can directly plug in this estimate into $\hat{P}^t_h(\cdot \mid s, a) = \hat \mu^t_h \phi(s, a)$. +\end{aligned}$$ + +where we include a $\lambda I$ term to ensure that the matrix $A^t_h$ is invertible. (This can also be derived by adding a $\lambda \|\mu\|_{\text{F}}^2$ regularization term to the objective.) We can directly plug in this estimate into $\hat{P}^t_h(\cdot \mid s, a) = \hat \mu^t_h \phi(s, a)$. #### Reward bonus -Now, to design the reward bonus, we can't apply Hoeffding anymore, since the terms no longer involve sample means of bounded random variables; Instead, we're incorporating information across different states and actions. Rather, we can construct an upper bound using *Chebyshev's inequality* in the same way we did for the LinUCB algorithm in the MAB setting {ref}`lin_ucb`: $$b^t_\hi(s, a) = \beta \sqrt{\phi(s, a)^\top (A^t_h)^{-1} \phi(s, a)}, \quad \beta = \tilde O(d \hor).$$ Note that this isn't explicitly inversely proportional to $N_h^t(s, a)$ as in the original UCB-VI bonus term {prf:ref}`eq:ucb_vi_bonus`. Rather, it is inversely proportional to the amount that the direction $\phi(s, a)$ has been explored in the history. That is, if $A_h^t$ has a large component in the direction $\phi(s, a)$, implying that this direction is well explored, then the bonus term will be small, and vice versa. +Now, to design the reward bonus, we can't apply Hoeffding anymore, since the terms no longer involve sample means of bounded random variables; Instead, we're incorporating information across different states and actions. 
Rather, we can construct an upper bound using *Chebyshev's inequality* in the same way we did for the LinUCB algorithm in the MAB setting [](#lin_ucb): -We can now plug in these transition estimates and reward bonuses into the UCB-VI algorithm {prf:ref}`ucb_vi`. +$$b^t_\hi(s, a) = \beta \sqrt{\phi(s, a)^\top (A^t_h)^{-1} \phi(s, a)}, \quad \beta = \tilde O(d \hor).$$ + +Note that this isn't explicitly inversely proportional to $N_h^t(s, a)$ as in the original UCB-VI bonus term {prf:ref}`eq:ucb_vi_bonus`. Rather, it is inversely proportional to the amount that the direction $\phi(s, a)$ has been explored in the history. That is, if $A_h^t$ has a large component in the direction $\phi(s, a)$, implying that this direction is well explored, then the bonus term will be small, and vice versa. + +We can now plug in these transition estimates and reward bonuses into the UCB-VI algorithm [](#ucb-vi-alg). #### Performance :::{prf:theorem} LinUCB-VI regret :label: lin_ucb_vi_regret -The LinUCB-VI algorithm achieves expected regret $$\E[\text{Regret}_T] = \E\left[\sum_{t=0}^{T-1} V^\star_0(s_0) - V^{\pi^t}_0(s_0) \right] \le \tilde O(H^2 d^{1.5} \sqrt{T})$$ +The LinUCB-VI algorithm achieves expected regret + +$$\E[\text{Regret}_T] = \E\left[\sum_{t=0}^{T-1} V^\star_0(s_0) - V^{\pi^t}_0(s_0) \right] \le \tilde O(H^2 d^{1.5} \sqrt{T})$$ ::: Comparing this to our bound for UCB-VI in an environment without this linear assumption, we see that we go from a sample complexity of $\tilde \Omega(H^4 |\mathcal{S}||\mathcal{A}|)$ to $\tilde \Omega(H^4 d^{3})$. This new sample complexity only depends on the feature dimension and not on the state or action space of the MDP! @@ -275,8 +352,8 @@ In this chapter, we've explored how to explore in an unknown MDP. - We first discussed the explore-then-exploit algorithm {prf:ref}`explore_then_exploit`, a simple way to explore a deterministic MDP by visiting all state-action pairs. -- We then discussed how to treat an unknown MDP as a MAB {ref}`mdp_mab`, and how this approach is inefficient since it doesn't make use of relationships between policies. +- We then discussed how to treat an unknown MDP as a MAB [](#mdp_mab), and how this approach is inefficient since it doesn't make use of relationships between policies. -- We then introduced the UCB-VI algorithm {prf:ref}`ucb_vi`, which models the unknown MDP by a proxy MDP with a reward bonus term that encourages exploration. +- We then introduced the UCB-VI algorithm [](#ucb-vi-alg), which models the unknown MDP by a proxy MDP with a reward bonus term that encourages exploration. -- Finally, assuming that the transitions and rewards are linear with respect to a feature transformation of the state and action, we introduced the LinUCB-VI algorithm {ref}`lin_ucb_vi`, which has a sample complexity independent of the size of the state and action spaces. +- Finally, assuming that the transitions and rewards are linear with respect to a feature transformation of the state and action, we introduced the LinUCB-VI algorithm [](#lin_ucb_vi), which has a sample complexity independent of the size of the state and action spaces. 
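To make the pieces above concrete, here is a minimal sketch of the two computations that distinguish LinUCB-VI: the ridge-regression estimate of $\mu^\star_h$ and the elliptical reward bonus. The array names (`phis`, `next_onehots`), their shapes, and the treatment of $\beta$ and $\lambda$ as given constants are assumptions made for this illustration, not specified in the text.

```python
import jax.numpy as jnp

def ridge_transition_estimate(
    phis,          # (N, d): feature vectors phi(s_h^i, a_h^i) of the visited pairs at step h
    next_onehots,  # (N, S): one-hot vectors delta_{s_{h+1}^i} of the observed next states
    lam: float = 1.0,
):
    """Closed-form ridge regression estimate of mu_h^star, returned with shape (S, d)."""
    d = phis.shape[1]
    A = phis.T @ phis + lam * jnp.eye(d)        # A_h^t = sum_i phi_i phi_i^T + lambda I
    A_inv = jnp.linalg.inv(A)
    mu_hat = (A_inv @ phis.T @ next_onehots).T  # transpose of A^{-1} sum_i phi_i delta_i^T
    return mu_hat, A_inv

def elliptical_bonus(phi, A_inv, beta: float):
    """Exploration bonus b_h^t(s, a) = beta * sqrt(phi(s, a)^T (A_h^t)^{-1} phi(s, a))."""
    return beta * jnp.sqrt(phi @ A_inv @ phi)
```

The estimated transition distribution for a pair $(s, a)$ is then `mu_hat @ phi`, and the bonus shrinks exactly in those directions of feature space that the history has already explored.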
diff --git a/book/fitted_dp.md b/book/fitted_dp.md index 1d79d1c..0aa3f05 100644 --- a/book/fitted_dp.md +++ b/book/fitted_dp.md @@ -11,16 +11,15 @@ kernelspec: name: python3 --- -(fitted_dp)= # Fitted Dynamic Programming Algorithms -```{contents} -:local: -``` +## Introduction -We borrow these definitions from the {ref}`mdps` chapter: +We borrow these definitions from the [](./mdps.md) chapter: ```{code-cell} +:tags: [hide-input] + from typing import NamedTuple, Callable, Optional from jaxtyping import Float, Array import jax.numpy as np @@ -67,9 +66,7 @@ def q_to_greedy(Q: QFunction) -> Policy: return lambda s, h: np.argmax(Q(s, h)) ``` -## Introduction - -The {ref}`mdps` chapter discussed the case of **finite** MDPs, where the state and action spaces $\mathcal{S}$ and $\mathcal{A}$ were finite. +The [](./mdps.md) chapter discussed the case of **finite** MDPs, where the state and action spaces $\mathcal{S}$ and $\mathcal{A}$ were finite. This gave us a closed-form expression for computing the r.h.s. of {prf:ref}`the Bellman one-step consistency equation `. In this chapter, we consider the case of **large** or **continuous** state spaces, where the state space is too large to be enumerated. In this case, we need to *approximate* the value function and Q-function using methods from **supervised learning**. @@ -136,7 +133,7 @@ $$ \hat f = \arg\min_{f \in \mathcal{F}} \frac{1}{N} \sum_{i=1}^N (y_i - f(x_i))^2 $$ -We will cover the details of the minimization process in {ref}`the next section `. +We will cover the details of the minimization process in [](#the next section ). ::: :::{attention} @@ -151,9 +148,9 @@ Let us apply ERM to the RL problem of computing the optimal policy / value funct How did we compute the optimal value function in MDPs with _finite_ state and action spaces? -- In a {ref}`finite-horizon MDP `, we can use {prf:ref}`dynamic programming `, working backwards from the end of the time horizon, to compute the optimal value function exactly. +- In a [](#finite-horizon MDP ), we can use {prf:ref}`dynamic programming `, working backwards from the end of the time horizon, to compute the optimal value function exactly. -- In an {ref}`infinite-horizon MDP `, we can use {ref}`value iteration `, which iterates the Bellman optimality operator {eq}`bellman_optimality_operator` to approximately compute the optimal value function. +- In an [](#infinite-horizon MDP ), we can use [](#value iteration ), which iterates the Bellman optimality operator {eq}`bellman_optimality_operator` to approximately compute the optimal value function. Our existing approaches represent the value function, and the MDP itself, in matrix notation. @@ -213,7 +210,7 @@ $$ f(x) = \E [y \mid x] \quad \text{where} \quad y = r(s_\hi, a_\hi) + \max_{a'} Q^\star_{\hi + 1}(s', a'). $$ -Approximating the conditional expectation is precisely the task that [empirical risk minimization](erm) is suited for! +Approximating the conditional expectation is precisely the task that [](#erm) is suited for! Our above dataset would give us $N \cdot \hor$ samples in the dataset: @@ -272,11 +269,11 @@ FittingMethod = Callable[[Float[Array, "N D"], Float[Array, " N"]], QFunction] But notice that the definition of $y_{i \hi}$ depends on the Q-function itself! How can we resolve this circular dependency? -Recall that we faced the same issue [when evaluating a policy in an infinite-horizon MDP](iterative_pe). 
There, we iterated the {prf:ref}`Bellman operator ` since we knew that the policy's value function was a fixed point of the policy's Bellman operator. +Recall that we faced the same issue [when evaluating a policy in an infinite-horizon MDP](#iterative_pe). There, we iterated the [](#bellman_operator) since we knew that the policy's value function was a fixed point of the policy's Bellman operator. We can apply the same strategy here, using the $\hat f$ from the previous iteration to compute the labels $y_{i \hi}$, and then using this new dataset to fit the next iterate. -:::{prf:algorithm} Fitted Q-function iteration +:::{prf:definition} Fitted Q-function iteration :label: fitted_q_iteration 1. Initialize some function $\hat f(s, a, h) \in \mathbb{R}$. @@ -308,7 +305,7 @@ def fitted_q_iteration( We can also use this fixed-point interation to *evaluate* a policy using the dataset (not necessarily the one used to generate the trajectories): -:::{prf:algorithm} Fitted policy evaluation +:::{prf:definition} Fitted policy evaluation :label: fitted_evaluation **Input:** Policy $\pi : \mathcal{S} \times [H] \to \Delta(\mathcal{A})$ to be evaluated. @@ -348,7 +345,7 @@ Spot the difference between `fitted_evaluation` and `fitted_q_iteration`. (See t How would you modify this algorithm to evaluate the data collection policy? ::: -We can use this policy evaluation algorithm to adapt the {ref}`policy iteration algorithm ` to this new setting. The algorithm remains exactly the same -- repeatedly make the policy greedy w.r.t. its own value function -- except now we must evaluate the policy (i.e. compute its value function) using the iterative `fitted_evaluation` algorithm. +We can use this policy evaluation algorithm to adapt the [](#policy iteration algorithm ) to this new setting. The algorithm remains exactly the same -- repeatedly make the policy greedy w.r.t. its own value function -- except now we must evaluate the policy (i.e. compute its value function) using the iterative `fitted_evaluation` algorithm. ```{code-cell} def fitted_policy_iteration( @@ -367,3 +364,5 @@ def fitted_policy_iteration( ``` ## Summary + + diff --git a/book/imitation_learning.md b/book/imitation_learning.md index e0176e6..c7de3c1 100644 --- a/book/imitation_learning.md +++ b/book/imitation_learning.md @@ -11,9 +11,10 @@ kernelspec: name: python3 --- -(imitation_learning)= # Imitation Learning +## Introduction + Imagine you are tasked with learning how to drive. How do, or did, you go about it? At first, this task might seem insurmountable: there are a vast array of controls, and the cost of making a single mistake could be extremely high, making it hard to explore by trial and error. Luckily, there are already people in the world who know how to drive who can get you started. @@ -22,21 +23,24 @@ In this and many other examples, we all "stand on the shoulders of giants" and l Now in machine learning, much of the time, we are trying to teach machines to accomplish tasks that us humans are already proficient at. In such cases, the machine learning algorithm is the one learning the new skill, and humans are the "experts" that can demonstrate how to perform the task. **Imitation learning** is a direct application of this idea to machine learning for interactive tasks. -We'll see that the most naive form of imitation learning, called **behavioural cloning**, is really an application of supervised learning to interactive tasks. 
+We'll see that the most naive form of imitation learning, called **behavioral cloning**, is really an application of supervised learning to interactive tasks. We'll then explore **dataset aggregation** (DAgger) as a way to query an expert and learn even more effectively. -## Behavioural cloning +## Behavioral cloning -This notion of "learning from human-provided data" may remind you of the basic premise of {ref}`supervised_learning`, +This notion of "learning from human-provided data" may remind you of the basic premise of [](./supervised_learning.md), in which there is some mapping from _inputs_ to _outputs_ that us humans can implicitly compute, such as seeing a photo and being able to recognize its constituents. To teach a machine to calculate this mapping, we first collect a large _training dataset_ by getting people to label a lot of inputs, and then use some optimization algorithm to produce a predictor that maps from the inputs to the outputs as closely as possible. How does this relate to interactive tasks? Here, the input is the observation seen by the agent and the output is the action it selects, so the mapping is the agent's policy. What's stopping us from applying supervised learning techniques? -In practice, nothing! This is called **behavioural cloning.** +In practice, nothing! This is called **behavioral cloning.** + +:::{prf:definition} Behavioral cloning +:label: behavioral_cloning + -:::{prf:algorithm} 1. Collect a training dataset of trajectories generated by an expert policy $\pi_\text{data}$. Here, we treat each state-action pair as independent, resuling in a dataset $\mathcal{D} = (s^n, a^n)_{n=1}^{N}$. (For concreteness, if there are $M$ trajectories with a horizon $H$, then $N = M \times H$.) - Note that this is an inaccurate approximation! A key property of interactive tasks is that the agent's output -- the action that it takes -- may influence its next observation. 2. Use a SL algorithm $\texttt{fit} : \mathcal{D} \mapsto \tilde \pi$ to extract a policy $\tilde \pi$ that approximates the expert policy. @@ -54,7 +58,7 @@ More generally, though, we often choose the **negative log likelihood** as our l out of the space of all possible mappings, we search for the one according to which the training dataset is the most likely. :::{math} -\tilde \pi = \arg\max_{\pi \in \Pi} \Pr_{a^n \sim \pi(s^n)}(a^{0:N} \mid s^{0:N}) +\tilde \pi = \arg\max_{\pi \in \Pi} \pr_{a^n \sim \pi(s^n)}(a^{0:N} \mid s^{0:N}) ::: Can we quantify how well this algorithm works? @@ -76,7 +80,7 @@ Then, their value functions differ by where $H$ is the horizon. -:::{prf:theorem} Performance of behavioural cloning +:::{prf:theorem} Performance of behavioral cloning Recall the {prf:ref}`pdl` allows us to express the difference between $\pi_{\text{data}}$ and $\tilde \pi$ as diff --git a/book/index.md b/book/index.md index b712b13..ad9d06f 100644 --- a/book/index.md +++ b/book/index.md @@ -1,7 +1,20 @@ +--- +jupytext: + text_representation: + extension: .md + format_name: myst + format_version: 0.13 + jupytext_version: 1.16.2 +kernelspec: + display_name: Python 3 (ipykernel) + language: python + name: python3 +--- + # Introduction Welcome to the study of reinforcement learning! -This textbook accompanies the undergraduate course [CS 1840/STAT 184](http://lucasjanson.fas.harvard.edu/CS_Stat_184_0.html) taught at Harvard. +This textbook accompanies the undergraduate course [CS 1840/STAT 184](http://lucasjanson.fas.harvard.edu/courses/CS_Stat_184_0.html) taught at Harvard. 
It is intended to be a friendly yet rigorous introduction to this active subfield of machine learning. +++ @@ -11,7 +24,7 @@ It is intended to be a friendly yet rigorous introduction to this active subfiel This book assumes the same prerequisites as the course: You should be familiar with multivariable calculus, linear algebra, and probability. For Harvard undergraduates, this is fulfilled by Math 21a, Math 21b, and Stat 110, or their equivalents. Stat 111 is strongly recommended but not required. -Specifically, we will assume that you know the following topics. The _italicized terms_ have brief re-introductions in the text or in the {ref}`background`: +Specifically, we will assume that you know the following topics. The _italicized terms_ have brief re-introductions in the text or in the [](./background.md): - **Linear Algebra:** Vectors and matrices, matrix multiplication, matrix inversion, eigenvalues and eigenvectors. @@ -20,7 +33,7 @@ Specifically, we will assume that you know the following topics. The _italicized expectation and variance, the law of iterated expectations (Adam's rule), covariance, conditional probability, Bayes's rule, and the law of total probability. You should also be comfortable with programming in Python. -See {ref}`programming` for more about this textbook's philosophy regarding programming. +See [](#programming) for more about this textbook's philosophy regarding programming. +++ @@ -28,13 +41,15 @@ See {ref}`programming` for more about this textbook's philosophy regarding progr Broadly speaking, RL studies **sequential decision-making** in **dynamic environments.** -An RL algorithm finds a **policy,** or strategy, that maximizes the **reward** it obtains from the environment. +An RL algorithm finds a strategy, called a **policy,** that maximizes the **reward** it obtains from the environment. RL provides a powerful framework for attacking a wide variety of problems, including robotic control, video games and board games, resource management, language modelling, and more. It also provides an interdisciplinary paradigm for studying animal and human behavior. Many of the most stunning results in machine learning, ranging from AlphaGo to ChatGPT, are built using RL algorithms. ++++ + How does RL compare to the other two core machine learning paradigms, **supervised learning** and **unsupervised learning?** @@ -53,7 +68,7 @@ How does RL compare to the other two core machine learning paradigms, +++ -## Core tasks in reinforcement learning +## Core tasks of reinforcement learning What tasks, exactly, does RL comprise? An RL algorithm must typically solve two main subtasks: @@ -76,48 +91,46 @@ An RL algorithm must typically solve two main subtasks: The course will progress through the following units: -{ref}`mdps` introduces **Markov Decision Processes,** +[](./mdps.md) introduces **Markov Decision Processes,** the core mathematical framework for describing a large class of interactive environments. -{ref}`lqr` is a standalone chapter on the **linear quadratic regulator** (LQR), +[](./control.md) is a standalone chapter on the **linear quadratic regulator** (LQR), an important tool for *continuous control*, in which the state and action spaces are no longer _finite_ but rather _continuous_. This has widespread applications in robotics. -{ref}`bandits` introduces the **multi-armed bandit** (MAB) model for _stateless_ sequential decision-making tasks. +[](./bandits.md) introduces the **multi-armed bandit** (MAB) model for _stateless_ sequential decision-making tasks. 
In exploring a number of algorithms, we will see how each of them strikes a different balance between _exploring_ new options and _exploiting_ known options. This **exploration-exploitation tradeoff** is a core consideration in RL algorithm design. -{ref}`supervised_learning` is a standalone crash course on some tools from supervised learning that we will use in later chapters. +[](./supervised_learning.md) is a standalone crash course on some tools from supervised learning that we will use in later chapters. -{ref}`fitted_dp` introduces **fitted dynamic programming** (fitted DP) algorithms for solving MDPs. +[](./fitted_dp.md) introduces **fitted dynamic programming** (fitted DP) algorithms for solving MDPs. These algorithms use supervised learning to approximately evaluate policies when they cannot be evaluated exactly. -{ref}`pg` explores an important class of algorithms based on iteratively improving a policy. +[](./pg.md) explores an important class of algorithms based on iteratively improving a policy. We will also encounter the use of _deep neural networks_ to express more complicated policies and approximate complicated functions. -{ref}`imitation_learning` attempts to learn a good policy from expert demonstrations. +[](./imitation_learning.md) attempts to learn a good policy from expert demonstrations. At its most basic, this is an application of supervised learning to RL tasks. -{ref}`planning` looks at ways to _explicitly_ plan ahead when the environment's dynamics are known. +[](./planning.md) looks at ways to _explicitly_ plan ahead when the environment's dynamics are known. We will study the _Monte Carlo Tree Search_ heuristic, which has been used to great success in the famous AlphaGo algorithm and its successors. -{ref}`exploration` continues to investigate the exploration-exploitation tradeoff. +[](./exploration.md) continues to investigate the exploration-exploitation tradeoff. We will extend ideas from multi-armed bandits to the MDP setting. -{ref}`contextual_bandits` extends the multi-armed bandit setting with some observed state. - -{ref}`background` contains an overview of selected background mathematical content and programming content. +[](./background.md) contains an overview of selected background mathematical content and programming content. +++ @@ -175,3 +188,30 @@ It uses the [JAX](https://jax.readthedocs.io/en/latest/index.html) library for n JAX was chosen for the clarity of its functional style and due to its mature RL ecosystem, sustained in large part by the Google DeepMind research group and a large body of open-source contributors. We use the standard [Gymnasium](https://gymnasium.farama.org/) library for interfacing with RL environments. 
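For readers who have not used Gymnasium before, here is a minimal sketch of its standard interaction loop. The choice of `CartPole-v1` and the uniformly random action selection are placeholders for illustration, not choices made by the textbook.

```python
import gymnasium as gym

# Run one episode with uniformly random actions.
env = gym.make("CartPole-v1")
obs, info = env.reset(seed=184)

total_reward, done = 0.0, False
while not done:
    action = env.action_space.sample()  # stand-in for a learned policy
    obs, reward, terminated, truncated, info = env.step(action)
    total_reward += float(reward)
    done = terminated or truncated

env.close()
print(f"episode return: {total_reward}")
```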
+ +The following names are exported from the `utils` module: + +```python +import matplotlib.pyplot as plt + +# convenient class builder +from typing import NamedTuple + +# function typings +from collections.abc import Callable + +# array typings +from jaxtyping import Float, Array + +# convenient function composition +from functools import partial + +# numerical computing and linear algebra +import jax +import jax.numpy as jnp + +# print functions as latex +import latexify + +plt.style.use("fivethirtyeight") +``` diff --git a/book/mdps.md b/book/mdps.md index 300f407..d4489eb 100644 --- a/book/mdps.md +++ b/book/mdps.md @@ -11,22 +11,13 @@ kernelspec: name: python3 --- -(mdps)= # Markov Decision Processes ## Introduction -```{code-cell} -:tags: [hide-input] - -from typing import NamedTuple -from jaxtyping import Float, Array -import jax.numpy as jnp -from jax import vmap -from functools import partial -``` - -The field of RL studies how an agent can learn to make sequential decisions in an interactive environment. This is a very general problem! How can we *formalize* this task in a way that is both *sufficiently general* yet also tractable enough for *fruitful analysis*? +The field of RL studies how an agent can learn to make sequential decisions in an interactive environment. +This is a very general problem! +How can we *formalize* this task in a way that is both *sufficiently general* yet also tractable enough for *fruitful analysis*? Let’s consider some examples of sequential decision problems to identify the key common properties we’d like to capture: @@ -50,7 +41,7 @@ An interactive environment satisfies the **Markov property** if the probability of transitioning to a new state only depends on the current state and action: -$$\P(s_{\hi+1} \mid s_0, a_0, \dots, s_\hi, a_\hi) = P(s_{\hi+1} \mid s_\hi, a_\hi)$$ +$$\pr(s_{\hi+1} \mid s_0, a_0, \dots, s_\hi, a_\hi) = P(s_{\hi+1} \mid s_\hi, a_\hi)$$ where $P : \mathcal{S} \times \mathcal{A} \to \triangle(\mathcal{S})$ describes the state transitions. (We’ll elaborate on this notation later in the chapter.) @@ -73,6 +64,10 @@ We’ll describe how to _evaluate_ different strategies, called **policies,** an the **optimal policy** for a given MDP. We’ll introduce the **Bellman consistency condition**, which allows us to analyze the whole sequence of interactions in terms of individual timesteps. +```{code-cell} ipython3 +from utils import NamedTuple, Float, Array, partial, jax, jnp, latexify +``` + ## Finite-horizon MDPs ### Definition @@ -125,7 +120,7 @@ $$ Verify that the types and shapes provided above make sense! ::: -```{code-cell} +```{code-cell} ipython3 class MDP(NamedTuple): """A description of a Markov decision process with finitely many states and actions.""" S: int # number of states @@ -170,42 +165,38 @@ Consider a time horizon of $\hor = 7$ days (one interaction per day). Let $t = 0$ correspond to Monday and $t = 6$ correspond to Sunday. 
::: -```{code-cell} +```{code-cell} ipython3 tidy_mdp = MDP( S=2, # 0 = orderly, 1 = messy A=2, # 0 = ignore, 1 = tidy μ=jnp.array([1.0, 0.0]), # start in orderly state - P=jnp.array( + P=jnp.array([ [ - [ - [0.7, 0.3], # orderly, ignore - [1.0, 0.0], # orderly, tidy - ], - [ - [0.0, 1.0], # messy, ignore - [1.0, 0.0], # messy, tidy - ], - ] - ), - r=jnp.array( + [0.7, 0.3], # orderly, ignore + [1.0, 0.0], # orderly, tidy + ], [ - [ - 1.0, # orderly, ignore - -1.0, # orderly, tidy - ], - [ - -1.0, # messy, ignore - 0.0, # messy, tidy - ], + [0.0, 1.0], # messy, ignore + [1.0, 0.0], # messy, tidy + ], + ]), + r=jnp.array([ + [ + 1.0, # orderly, ignore + -1.0, # orderly, tidy + ], + [ + -1.0, # messy, ignore + 0.0, # messy, tidy ] - ), + ]), H=7, ) ``` ### Policies -:::{prf:definition} Policies +::::{prf:definition} Policies :label: policy A **policy** $\pi$ describes the agent's strategy: @@ -219,6 +210,18 @@ inputs, and time-dependence. actions while a stochastic policy outputs *distributions* over actions. +:::{figure} ./shared/deterministic_policy.png +:align: center + +A deterministic policy. +::: + +:::{figure} ./shared/stochastic_policy.png +:align: center + +A stochastic policy. +::: + 2. **State-dependent or history-dependent.** A state-dependent (a.k.a. "Markovian") policy only depends on the current state, while a history-dependent policy depends on the sequence of past states, @@ -229,7 +232,9 @@ inputs, and time-dependence. remains the same function at all time steps, while a time-dependent policy can depend on the current timestep. For consistency with states and actions, we will denote the timestep as a subscript, i.e. $\pi = \{ \pi_0, \dots, \pi_{\hor-1} \}.$ -::: +:::: + ++++ Note that for finite state and action spaces, we can represent a randomized mapping $\mathcal{S} \to \Delta(\mathcal{A})$ @@ -254,13 +259,32 @@ Here are some possible policies for the tidying MDP {prf:ref}`tidy_mdp`: and $\pi_\hi(\text{orderly}) = \text{ignore}$ for all $\hi$. ::: -```{code-cell} +```{code-cell} ipython3 # arrays of shape (H, S, A) represent time-dependent policies -tidy_policy_always_tidy = jnp.zeros((7, 2, 2)).at[:, :, 1].set(1.0) -tidy_policy_weekends = jnp.zeros((7, 2, 2)).at[5:7, :, 1].set(1.0).at[0:5, :, 0].set(1.0) -tidy_policy_messy_only = jnp.zeros((7, 2, 2)).at[:, 1, 1].set(1.0).at[:, 0, 0].set(1.0) +tidy_policy_always_tidy = ( + jnp.zeros((7, 2, 2)) + .at[:, :, 1].set(1.0) +) +tidy_policy_weekends = ( + jnp.zeros((7, 2, 2)) + .at[5:7, :, 1].set(1.0) + .at[0:5, :, 0].set(1.0) +) +tidy_policy_messy_only = ( + jnp.zeros((7, 2, 2)) + .at[:, 1, 1].set(1.0) + .at[:, 0, 0].set(1.0) +) ``` +:::{note} +Array objects in Jax are **immutable,** that is, they cannot be _changed._ +This might seem inconvenient, but in larger projects, +immutability makes code much easier to reason about. +::: + ++++ + (trajectories)= ### Trajectories @@ -275,9 +299,12 @@ where $r_\hi = r(s_\hi, a_\hi)$. (Note that some sources omit the reward at the final time step. This is a minor detail.) ::: -```{code-cell} +```{code-cell} ipython3 class Transition(NamedTuple): - """A single state-action-reward interaction with the environment.""" + """A single state-action-reward interaction with the environment. + + A trajectory comprises a sequence of transitions. + """ s: int a: int r: float @@ -295,19 +322,21 @@ transitioning according to the state transitions, and observing the rewards. That is, a policy induces a distribution $\rho^{\pi}$ over trajectories. (We assume that $\mu$ and $P$ are clear from context.) 
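To make the trajectory distribution $\rho^\pi$ concrete, here is a rough sketch of how one might sample a single trajectory using the `MDP` and `Transition` classes above. The helper name `sample_trajectory` and the use of `jax.random` are choices made for this illustration only.

```python
def sample_trajectory(
    key: jax.Array,
    mdp: MDP,
    policy: Float[Array, "H S A"],
) -> list[Transition]:
    """Roll out one trajectory tau = (s_0, a_0, r_0, ..., s_{H-1}, a_{H-1}, r_{H-1})."""
    τ = []
    key, key_init = jax.random.split(key)
    s = jax.random.choice(key_init, mdp.S, p=mdp.μ)            # s_0 ~ mu
    for h in range(mdp.H):
        key, key_a, key_s = jax.random.split(key, 3)
        a = jax.random.choice(key_a, mdp.A, p=policy[h, s])    # a_h ~ pi_h(. | s_h)
        τ.append(Transition(s=int(s), a=int(a), r=float(mdp.r[s, a])))
        s = jax.random.choice(key_s, mdp.S, p=mdp.P[s, a])     # s_{h+1} ~ P(. | s_h, a_h)
    return τ

# e.g. sample_trajectory(jax.random.PRNGKey(184), tidy_mdp, tidy_policy_messy_only)
```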
-:::{prf:example} Trajectories in the tidying environment +::::{prf:example} Trajectories in the tidying environment :label: tidy_traj Here is a possible trajectory for the tidying example: -| $\hi$ | $0$ | $1$ | $2$ | $3$ | $4$ | $5$ | $6$ | -|:-----:|:-------:|:-------:|:-------:|:------:|:-----:|:-------:|:-------:| -| $s$ | orderly | orderly | orderly | messy | messy | orderly | orderly | -| $a$ | tidy | ignore | ignore | ignore | tidy | ignore | ignore | -| $r$ | $-1$ | $1$ | $1$ | $-1$ | $0$ | $1$ | $1$ | +:::{table} +| $\hi$ | $0$ | $1$ | $2$ | $3$ | $4$ | $5$ | $6$ | +| :-----: | :-------: | :-------: | :-------: | :------: | :-----: | :-------: | :-------: | +| $s$ | orderly | orderly | orderly | messy | messy | orderly | orderly | +| $a$ | tidy | ignore | ignore | ignore | tidy | ignore | ignore | +| $r$ | $-1$ | $1$ | $1$ | $-1$ | $0$ | $1$ | $1$ | +::: Could any of the policies in {prf:ref}`tidy_policy` have generated this trajectory? -::: +:::: Note that for a state-dependent policy, using the Markov property {prf:ref}`markov`, we can write down the likelihood function of this probability distribution in an **autoregressive** way (i.e. one timestep at a time): @@ -318,18 +347,23 @@ we can write down the likelihood function of this probability distribution in an $$\rho^{\pi}(\tau) := \mu(s_0) \pi_0(a_0 \mid s_0) P(s_1 \mid s_0, a_0) \cdots P(s_{\hor-1} \mid s_{\hor-2}, a_{\hor-2}) \pi_{\hor-1}(a_{\hor-1} \mid s_{\hor-1})$$ ::: -```{code-cell} +```{code-cell} ipython3 def trajectory_log_likelihood( mdp: MDP, τ: list[Transition], π: Float[Array, "S A"], ) -> float: """Compute the log-likelihood of a trajectory under a given MDP and policy.""" + + # initial distribution and action total = jnp.log(mdp.μ[τ[0].s]) total += jnp.log(π[τ[0].s, τ[0].a]) + + # remaining state transitions and actions for i in range(1, mdp.H): total += jnp.log(mdp.P[τ[i - 1].s, τ[i - 1].a, τ[i].s]) total += jnp.log(π[τ[i].s, τ[i].a]) + return total ``` @@ -373,12 +407,16 @@ Similarly, we can define the **action-value function** (aka the $$Q_\hi^\pi(s, a) := \E_{\tau \sim \rho^\pi} [r_\hi + \cdots + r_{H-1} \mid s_\hi = s, a_\hi = a]$$ ::: ++++ + +#### Relating the value function and action-value function + Note that the value function is just the expected action-value over actions drawn from the policy: $$V_\hi^\pi(s) = \E_{a \sim \pi_\hi(s)} [Q_\hi^\pi(s, a)]$$ -```{code-cell} +```{code-cell} ipython3 def q_to_v( policy: Float[Array, "S A"], q: Float[Array, "S A"], @@ -387,37 +425,40 @@ def q_to_v( Compute the value function for a given policy in a known finite MDP at a single timestep from its action-value function. """ - return jnp.sum(policy * q, axis=1) + return jnp.average(q, weights=policy, axis=1) ``` -and the -action-value can be expressed in terms of the value of the following +and the action-value is the sum of the immediate reward and the expected value of the following state: $$Q_\hi^\pi(s, a) = r(s, a) + \E_{s' \sim P(s, a)} [V_{\hi+1}^\pi(s')]$$ -```{code-cell} +```{code-cell} ipython3 def v_to_q( mdp: MDP, - v: Float[Array, " S"], + v_next: Float[Array, " S"], ) -> Float[Array, "S A"]: """ Compute the action-value function in a known finite MDP at a single timestep from the corresponding value function. 
""" # the discount factor is relevant later - return mdp.r + mdp.γ * mdp.P @ v + return mdp.r + mdp.γ * mdp.P @ v_next # convert a list of v functions to a list of q functions -v_ary_to_q_ary = vmap(v_to_q, in_axes=(None, 0)) +v_ary_to_q_ary = jax.vmap(v_to_q, in_axes=(None, 0)) ``` #### Greedy policies -For any given $Q \in \mathbb{R}^{|\mathcal{S}| \times |\mathcal{A}|}$, we can define the **greedy policy** $\hat \pi_Q$ as the policy that selects the action with the highest $Q$-value at each state: +For any given $Q \in \mathbb{R}^{|\mathcal{S}| \times |\mathcal{A}|}$, we can define the **greedy policy** $\hat \pi_Q$ as the deterministic policy that selects the action with the highest $Q$-value at each state: + +$$ +\hat \pi_Q(s) = \arg\max_{a} Q_{sa} +$$ -```{code-cell} +```{code-cell} ipython3 def q_to_greedy(q: Float[Array, "S A"]) -> Float[Array, "S A"]: """ Get the (deterministic) greedy policy w.r.t. an action-value function. @@ -433,7 +474,6 @@ def v_to_greedy(mdp: MDP, v: Float[Array, " S"]) -> Float[Array, "S A"]: return q_to_greedy(v_to_q(mdp, v)) ``` -(bellman_consistency)= ### The one-step (Bellman) consistency equation Note that by simply considering the cumulative reward as the sum of the @@ -450,7 +490,7 @@ V_\hi^\pi(s) = \E_{\substack{a \sim \pi_\hi(s) \\ s' \sim P(s, a)}} [r(s, a) + V $$ ::: -```{code-cell} +```{code-cell} ipython3 def check_bellman_consistency_v( mdp: MDP, policy: Float[Array, "H S A"], @@ -503,7 +543,8 @@ $$ $$ ::: -(bellman_operator)= ++++ + ### The one-step Bellman operator Fix a policy $\pi$. Consider the higher-order operator that takes in a @@ -514,9 +555,14 @@ equation for that "value function": :label: bellman_operator $$[\mathcal{J}^{\pi}(v)](s) := \E_{\substack{a \sim \pi(s) \\ s' \sim P(s, a)}} [r(s, a) + v(s')].$$ + +This is a crucial tool for reasoning about MDPs. +Intuitively, it answers the following question: +if we evaluate the _next_ state using $v$, +how good is the _current_ state, according to the given policy? ::: -```{code-cell} +```{code-cell} ipython3 :tags: [hide-input] def bellman_operator_looping( @@ -540,23 +586,24 @@ def bellman_operator_looping( return v_new ``` -```{code-cell} +Note that we can concisely implement this using the `q_to_v` and `v_to_q` utilities from above: + +```{code-cell} ipython3 def bellman_operator( mdp: MDP, policy: Float[Array, "S A"], v: Float[Array, " S"], ) -> Float[Array, " S"]: """For a known finite MDP, the Bellman operator can be exactly evaluated.""" - return jnp.sum(policy * (mdp.r + mdp.γ * mdp.P @ v), axis=1) return q_to_v(policy, v_to_q(mdp, v)) # equivalent + return jnp.sum(policy * (mdp.r + mdp.γ * mdp.P @ v), axis=1) ``` -We'll call $\mathcal{J}^\pi : (\mathcal{S} \to \mathbb{R}) \to (\mathcal{S} \to \mathbb{R})$ the **Bellman -operator** of $\pi$. Note that it's defined on any "value function" -mapping states to real numbers; $v$ doesn't have to be a well-defined -value function for some policy (hence the lowercase notation). The -Bellman operator also gives us a concise way to express the Bellman -consistency equation {prf:ref}`bellman_consistency` for the value function: +We'll call $\mathcal{J}^\pi : \mathbb{R}^\mathcal{S} \to \mathbb{R}^\mathcal{S}$ the **Bellman +operator** of $\pi$. +Note that it's defined on any "value function" mapping states to real numbers; +$v$ doesn't have to be a well-defined value function for some policy (hence the lowercase notation). 
+The Bellman operator also gives us a concise way to express {prf:ref}`bellman_consistency` for the value function: $$V_\hi^\pi = \mathcal{J}^{\pi}(V_{\hi+1}^\pi)$$ @@ -579,7 +626,7 @@ construct algorithms for computing the optimal policy. How can we actually compute the value function of a given policy? This is the task of **policy evaluation**. -:::{prf:algorithm} DP algorithm to evaluate a policy in a finite-horizon MDP +:::{prf:definition} DP algorithm to evaluate a policy in a finite-horizon MDP The Bellman consistency equation {prf:ref}`bellman_consistency` @@ -591,7 +638,7 @@ known, and work backwards in time, using the Bellman consistency equation to compute the value function at each time step. ::: -```{code-cell} +```{code-cell} ipython3 def dp_eval_finite(mdp: MDP, policy: Float[Array, "S A"]) -> Float[Array, "H S"]: """Evaluate a policy using dynamic programming.""" V_ary = [None] * mdp.H + [jnp.zeros(mdp.S)] # initialize to 0 at end of time horizon @@ -644,7 +691,7 @@ etc. You may wish to repeat this computation for the other policies to get a better sense of this algorithm. ::: -```{code-cell} +```{code-cell} ipython3 V_messy = dp_eval_finite(tidy_mdp, tidy_policy_messy_only) V_messy ``` @@ -691,7 +738,7 @@ action-value function: $$\pi_\hi^\star(s) = \arg\max_a Q_\hi^\star(s, a).$$ ::: -::::{dropdown} Proof +::::{prf:proof} Proof Let $V^{\star}$ and $Q^{\star}$ denote the optimal value and action-value functions. Consider the greedy policy @@ -755,7 +802,7 @@ $$ And so we have $V^{\star} = V^{\hat \pi}$, making $\hat \pi$ optimal. :::: -Note that this also gives simplified forms of the [Bellman consistency](bellman_consistency) equations for the optimal policy: +Note that this also gives simplified forms of the [Bellman consistency](#bellman_consistency) equations for the optimal policy: ::::{prf:corollary} Bellman consistency equations for the optimal policy :label: bellman_consistency_optimal @@ -773,7 +820,7 @@ need to do is compute the optimal value function and optimal policy. We can do this by working backwards in time using **dynamic programming** (DP). -:::{prf:algorithm} DP algorithm to compute an optimal policy in a finite-horizon MDP +:::{prf:definition} DP algorithm to compute an optimal policy in a finite-horizon MDP :label: pi_star_dp **Base case.** At the end of the episode (time step $H-1$), we can't @@ -806,7 +853,7 @@ $$ $$ ::: -```{code-cell} +```{code-cell} ipython3 def find_optimal_policy(mdp: MDP): Q = [None] * mdp.H pi = [None] * mdp.H @@ -830,13 +877,13 @@ operations to evaluate the average value over $s'$. This gives a total computation time of $O(H \cdot |\mathcal{S}|^2 \cdot |\mathcal{A}|)$. Note that this algorithm is identical to the policy evaluation algorithm -[`dp_eval_finite`](eval_dp), but instead of *averaging* over the +[`dp_eval_finite`](#eval_dp), but instead of *averaging* over the actions chosen by a policy, we instead simply take a *maximum* over the action-values. We'll see this relationship between **policy evaluation** and **optimal policy computation** show up again in the infinite-horizon setting. -```{code-cell} +```{code-cell} ipython3 π_opt, V_opt, Q_opt = find_optimal_policy(tidy_mdp) assert jnp.allclose(π_opt, tidy_policy_messy_only) assert jnp.allclose(V_opt, V_messy) @@ -852,8 +899,9 @@ $H = \infty$)? This is the setting of **infinite horizon** MDPs. In this chapter, we'll describe the necessary adjustments from the finite-horizon case to make the problem tractable. 
We'll show that the -[Bellman operator](bellman_operator) in the discounted reward setting is a -**contraction mapping** for any policy. We'll discuss how to evaluate +[Bellman operator](#bellman_operator) in the discounted reward setting is a +**contraction mapping** for any policy. +We'll discuss how to evaluate policies (i.e. compute their corresponding value functions). Finally, we'll present and analyze two iterative algorithms, based on the Bellman operator, for computing the optimal policy: **value iteration** and @@ -893,7 +941,7 @@ $$M = (\mathcal{S}, \mathcal{A}, \mu, P, r, \gamma).$$ Code-wise, we can reuse the `MDP` class from before {prf:ref}`finite_horizon_mdp` and set `mdp.H = float('inf')`. -```{code-cell} +```{code-cell} ipython3 tidy_mdp_inf = tidy_mdp._replace(H=float("inf"), γ=0.95) ``` @@ -935,7 +983,7 @@ time step we condition on when defining the value function? ### The Bellman operator is a contraction mapping -Recall from [](bellman_operator) that the Bellman operator $\mathcal{J}^{\pi}$ +Recall from [](#bellman_operator) that the Bellman operator $\mathcal{J}^{\pi}$ for a policy $\pi$ takes in a "value function" $v : \mathcal{S} \to \mathbb{R}$ and returns the r.h.s. of the Bellman equation for that "value function". In the infinite-horizon setting, this is @@ -1011,7 +1059,7 @@ $$ $$ ::: -:::{dropdown} Proof of {prf:ref}`bellman_contraction` +:::{prf:proof} Proof of {prf:ref}`bellman_contraction` For all states $s \in \mathcal{S}$, @@ -1029,7 +1077,7 @@ $$ ### Policy evaluation in infinite-horizon MDPs -The backwards DP technique we used in [the finite-horizon case](eval_dp) no +The backwards DP technique we used in [the finite-horizon case](#eval_dp) no longer works since there is no "final timestep" to start from. We'll need another approach to policy evaluation. @@ -1096,7 +1144,7 @@ is invertible because it maps any nonzero vector to a vector with at least one nonzero element.) ::: -```{code-cell} +```{code-cell} ipython3 def eval_deterministic_infinite( mdp: MDP, policy: Float[Array, "S A"] ) -> Float[Array, " S"]: @@ -1128,7 +1176,7 @@ $1/(1-\gamma) = 20$. We see that the value function is indeed slightly lower than this. ::: -```{code-cell} +```{code-cell} ipython3 eval_deterministic_infinite(tidy_mdp_inf, tidy_policy_messy_only[0]) ``` @@ -1150,7 +1198,7 @@ $$v^{(t+1)} = \mathcal{J}^{\pi}(v^{(t)}),$$ i.e. $v^{(t)} = (\mathcal{J}^{\pi})^{(t)} (v^{(0)})$. Note that each iteration takes $O(|\mathcal{S}|^2)$ time for the matrix-vector multiplication. -```{code-cell} +```{code-cell} ipython3 def supremum_norm(v): return jnp.max(jnp.abs(v)) # same as jnp.linalg.norm(v, jnp.inf) @@ -1173,7 +1221,7 @@ Then, as we showed in {eq}`bellman_convergence`, by the Banach fixed-point theor $$\|v^{(t)} - V^\pi \|_{\infty} \le \gamma^{t} \| v^{(0)} - V^\pi\|_{\infty}.$$ -```{code-cell} +```{code-cell} ipython3 iterative_evaluation(tidy_mdp_inf, tidy_policy_messy_only[0]) ``` @@ -1203,7 +1251,6 @@ $\|v^{(0)} - V^\pi\|_{\infty} \le 1/(1-\gamma)$ and $\log (1/x) \ge 1-x$. 
:::: -(optimal_policy_finite)= ### Optimal policies in infinite-horizon MDPs Now let's move on to solving for an optimal policy in the @@ -1256,7 +1303,7 @@ gives the **Bellman optimality operator** [\mathcal{J}^{\star}(v)](s) = \max_a \left[ r(s, a) + \gamma \E_{s' \sim P(s, a)} v(s') \right] ::: -```{code-cell} +```{code-cell} ipython3 def bellman_optimality_operator(mdp: MDP, v: Float[Array, " S"]) -> Float[Array, " S"]: return jnp.max(mdp.r + mdp.γ * mdp.P @ v, axis=1) @@ -1273,19 +1320,19 @@ operator is a contracting map still holds, and so we can repeatedly apply this operator to converge to the optimal value function! This algorithm is known as **value iteration**. -```{code-cell} +```{code-cell} ipython3 def value_iteration(mdp: MDP, ε: float = 1e-6) -> Float[Array, " S"]: """Iterate the Bellman optimality operator until convergence.""" op = partial(bellman_optimality_operator, mdp) return loop_until_convergence(op, jnp.zeros(mdp.S), ε) ``` -```{code-cell} +```{code-cell} ipython3 value_iteration(tidy_mdp_inf) ``` Note that the runtime analysis for an $\epsilon$-optimal value function -is exactly the same as [iterative policy evaluation](iterative_pe)! This is because value iteration is simply +is exactly the same as [iterative policy evaluation](#iterative_pe)! This is because value iteration is simply the special case of applying iterative policy evaluation to the *optimal* value function. @@ -1315,7 +1362,7 @@ where $\hat \pi(s) = \arg\max_a q(s, a)$ is the greedy policy w.r.t. $$q(s, a) = r(s, a) + \E_{s' \sim P(s, a)} v(s').$$ ::: -:::{dropdown} Proof +:::{prf:proof} Proof We first have $$ @@ -1380,7 +1427,7 @@ iterations to achieve an $\epsilon$-accurate estimate of the optimal value funct Can we mitigate this "greedy worsening"? What if instead of approximating the optimal value function and then acting greedily by it at the very end, we iteratively improve the policy and value function *together*? This is the idea behind **policy iteration**. In each step, we simply set the policy to act greedily with respect to its own value function. -```{code-cell} +```{code-cell} ipython3 def policy_iteration(mdp: MDP, ε=1e-6) -> Float[Array, "S A"]: """Iteratively improve the policy and value function.""" def op(pi): @@ -1389,7 +1436,7 @@ def policy_iteration(mdp: MDP, ε=1e-6) -> Float[Array, "S A"]: return loop_until_convergence(op, π_init, ε) ``` -```{code-cell} +```{code-cell} ipython3 policy_iteration(tidy_mdp_inf) ``` diff --git a/book/myst.yml b/book/myst.yml new file mode 100644 index 0000000..fc00164 --- /dev/null +++ b/book/myst.yml @@ -0,0 +1,37 @@ +version: 1 +project: + title: 'CS/STAT 184: Introduction to Reinforcement Learning' + authors: + - name: Alexander D. 
Cai + github: adzcai/cs-stat-184-notes + bibliography: + - shared/references.bib + exports: + - format: pdf + template: plain_latex_book + output: exports/book.pdf + toc: + - file: index.md + - file: mdps.md + - file: control.md + - file: bandits.md + - file: supervised_learning.md + - file: fitted_dp.md + - file: pg.md + - file: imitation_learning.md + - file: planning.md + - file: exploration.md + - file: background.md + math: + '\E': '\mathop{\mathbb{E}}' + '\pr': '\mathop{\mathbb{P}}' + '\kl': '\mathrm{KL}\left(#1\parallel#2\right)' + '\ind': '\mathbf{1}\left\{#1\right\}' + '\hi': 'h' # or 't' (time horizon index) + '\hor': 'H' # or 'T' (time horizon) + '\st': 's' # or 'x' for control chapter + '\act': 'a' # or 'u' for control chapter +site: + options: + logo: shared/184.png + template: book-theme diff --git a/book/pg.md b/book/pg.md index 8ee5c0c..c9cc6da 100644 --- a/book/pg.md +++ b/book/pg.md @@ -11,9 +11,10 @@ kernelspec: name: python3 --- -(pg)= # Policy Optimization +## Introduction + The core task of RL is finding the **optimal policy** in a given environment. This is essentially an _optimization problem:_ out of some space of policies, @@ -23,7 +24,7 @@ It's typically intractable to compute the optimal policy exactly. Instead, **policy optimization algorithms** start from some randomly initialized policy, and then _improve_ it step by step. We've already seen some examples of these, -namely {ref}`policy_iteration` for finite MDPs and {ref}`iterative_lqr` in continuous control. +namely [](#policy_iteration) for finite MDPs and [](#iterative_lqr) in continuous control. In particular, we often use policies that can be described by some finite set of _parameters._ For such parameterized policies, we can approximate the **policy gradient:** @@ -40,18 +41,7 @@ a general **optimization method.** This is helpful to stabilize training and widely used in practice. ```{code-cell} ipython3 -import numpy as np -import jax -from jaxtyping import Float, Array -from bokeh.plotting import figure, show, output_notebook -from bokeh.models import Arrow, VeeHead, ColumnDataSource, LinearColorMapper, BasicTicker, ColorBar -from bokeh.transform import linear_cmap -from bokeh.layouts import gridplot -from typing import TypeVar, Callable - -Params = TypeVar("Params") - -output_notebook() +from utils import plt, Array, Callable, jax, jnp ``` ## Gradient Ascent @@ -69,34 +59,35 @@ def f(x, y): """Himmelblau's function""" return (x**2 + y - 11)**2 + (x + y**2 - 7)**2 -x = np.linspace(-5, 5, 400) -y = np.linspace(-5, 5, 400) -X, Y = np.meshgrid(x, y) +# Create a grid of points +x = jnp.linspace(-5, 5, 400) +y = jnp.linspace(-5, 5, 400) +X, Y = jnp.meshgrid(x, y) Z = f(X, Y) -p = figure(width=600, height=600, title="Himmelblau's function") +# Create the plot +fig, ax = plt.subplots(figsize=(6, 6)) -mapper = LinearColorMapper(palette="Viridis256", low=Z.min(), high=Z.max()) -p.image(image=[Z], x=-5, y=-5, dw=10, dh=10, color_mapper=mapper) +# Plot the function using imshow +img = ax.imshow(Z, extent=[-5, 5, -5, 5], origin='lower') -color_bar = ColorBar(color_mapper=mapper) -p.add_layout(color_bar, 'right') +# Add color bar +fig.colorbar(img, ax=ax) -tx, ty = 1., 1. 
+# Gradient computation using JAX +tx, ty = 1.0, 1.0 gx, gy = jax.grad(f, argnums=(0, 1))(tx, ty) -p.scatter(x=[tx], y=[ty], size=10, color="red") +# Scatter point +ax.scatter(tx, ty, color='red', s=100) + +# Add arrow representing the gradient +ax.arrow(tx, ty, gx * 0.01, gy * 0.01, head_width=0.3, head_length=0.3, fc='blue', ec='blue') -p.add_layout(Arrow( - end=VeeHead(size=15), - x_start=tx, - y_start=ty, - x_end=tx + gx.item() * 0.01, - y_end=ty + gy.item() * 0.01, - line_color="blue", -)) +# Add plot title +ax.set_title("Himmelblau's Function") -show(p) +plt.show() ``` For differentiable functions, this can be thought of as the vector of partial derivatives, @@ -113,7 +104,7 @@ you take the dot product of the difference vector with the gradient. This means that the direction with the highest slope is exactly the gradient itself, so we can describe the gradient ascent algorithm as follows: -:::{prf:algorithm} Gradient ascent +:::{prf:definition} Gradient ascent $$ \begin{pmatrix} x^{k+1} \\ z^{k+1} @@ -190,8 +181,8 @@ In the SL example above, we might randomly choose a *minibatch* of samples and u ```{code-cell} ipython3 def sgd( - θ_init: Params, - estimate_gradient: Callable[[Params], Params], + θ_init: Array, + estimate_gradient: Callable[[Array], Array], η: float, n_steps: int, ): @@ -518,14 +509,14 @@ def pg_with_learned_baseline_pseudocode(env, π, η, θ_init, K, N): trajectories = sample_trajectories(env, π(θ), N) V_hat = fit(trajectories) # estimates the value function of π(θ) τ = sample_trajectories(env, π(θ), 1) - g = np.zeros_like(θ) # gradient estimator + g = jnp.zeros_like(θ) # gradient estimator for h, (s, a) in enumerate(τ): def log_likelihood(θ_): - return np.log(π(θ_)(s, a)) - g += jax.grad(log_likelihood)(θ) * (return_to_go(τ, h) - V_hat(s)) + return jnp.log(π(θ_)(s, a)) + g = g + jax.grad(log_likelihood)(θ) * (return_to_go(τ, h) - V_hat(s)) - θ += η * g + θ = θ + η * g return θ ``` @@ -542,7 +533,7 @@ Note that the gradient estimator will be unbiased regardless of the baseline. -What advantages does the policy gradient algorithm have over {ref}`policy_iteration`? +What advantages does the policy gradient algorithm have over [](#policy_iteration)? 
:::{note} Policy iteration recap Recall that policy iteration is an algorithm for MDPs with unknown state transitions where we alternate between these two steps: @@ -652,7 +643,7 @@ while ensuring that its trajectory distribution does not change too much: $$ \begin{aligned} -\theta^{k+1} &\gets \argmax_{\theta^{\text{opt}}} \E_{s_0, \dots, s_{H-1} \sim \pi^{k}} \left[ \sum_{\hi=0}^{\hor-1} \E_{a_\hi \sim \pi^{\theta^\text{opt}}(s_\hi)} A^{\pi^{k}}(s_\hi, a_\hi) \right] \\ +\theta^{k+1} &\gets \arg\max_{\theta^{\text{opt}}} \E_{s_0, \dots, s_{H-1} \sim \pi^{k}} \left[ \sum_{\hi=0}^{\hor-1} \E_{a_\hi \sim \pi^{\theta^\text{opt}}(s_\hi)} A^{\pi^{k}}(s_\hi, a_\hi) \right] \\ & \text{where } \text{distance}(\rho_{\theta^{\text{opt}}}, \rho_{\theta^k}) < \delta \end{aligned} $$ @@ -709,7 +700,7 @@ def trpo_pseudocode(env, δ, θ_init, M): kl_div = 0 for τ in trajectories: for s, a, _r in τ: - kl_div += np.log(π(θ)(s, a)) - np.log(π(θ_)(s, a)) + kl_div += jnp.log(π(θ)(s, a)) - jnp.log(π(θ_)(s, a)) return kl_div <= δ θ = optimize(approximate_gain, constraint) @@ -724,7 +715,7 @@ Applying importance sampling allows us to estimate the TRPO objective as follows ::::{prf:definition} Trust region policy optimization (implementation) :label: trpo_implement -:::{prf:algorithmic} TODO +:::{prf:definitionic} TODO Initialize $\theta^0$ Sample $N$ trajectories from $\rho^k$ to learn a value estimator $\tilde b_\hi(s) \approx V^{\pi^k}_\hi(s)$ @@ -732,7 +723,7 @@ Sample $N$ trajectories from $\rho^k$ to learn a value estimator $\tilde b_\hi(s Sample $M$ trajectories $\tau_0, \dots, \tau_{M-1} \sim \rho^k$ $$\begin{gathered} - \theta^{k+1} \gets \argmax_{\theta} \frac{1}{M} \sum_{m=0}^{M-1} \sum_{h=0}^{H-1} \frac{\pi_\theta(a_\hi \mid s_\hi)}{\pi^k(a_\hi \mid s_\hi)} [ R_\hi(\tau_m) - \tilde b_\hi(s_\hi) ] \\ + \theta^{k+1} \gets \arg\max_{\theta} \frac{1}{M} \sum_{m=0}^{M-1} \sum_{h=0}^{H-1} \frac{\pi_\theta(a_\hi \mid s_\hi)}{\pi^k(a_\hi \mid s_\hi)} [ R_\hi(\tau_m) - \tilde b_\hi(s_\hi) ] \\ \text{where } \sum_{m=0}^{M-1} \sum_{h=0}^{H-1} \log \frac{\pi_k(a_\hi^m \mid s_\hi^m)}{\pi_\theta(a_\hi^m \mid s_\hi^m)} \le \delta \end{gathered}$$ @@ -900,7 +891,7 @@ we can instead impose a *soft* constraint by incorporating it into the objective $$ \begin{aligned} -\theta^{k+1} &\gets \argmax_{\theta} \E_{s_0, \dots, s_{H-1} \sim \rho_{\pi^{k}}} \left[ \sum_{\hi=0}^{\hor-1} \E_{a_\hi \sim \pi_{\theta}(s_\hi)} A^{\pi^{k}}(s_\hi, a_\hi) \right] - \lambda \kl{\rho_{\theta}}{\rho_{\theta^k}} +\theta^{k+1} &\gets \arg\max_{\theta} \E_{s_0, \dots, s_{H-1} \sim \rho_{\pi^{k}}} \left[ \sum_{\hi=0}^{\hor-1} \E_{a_\hi \sim \pi_{\theta}(s_\hi)} A^{\pi^{k}}(s_\hi, a_\hi) \right] - \lambda \kl{\rho_{\theta}}{\rho_{\theta^k}} \end{aligned} $$ @@ -936,7 +927,7 @@ we would need the actions to also come from $\pi^k$. This should sound familiar: we want to estimate an expectation over one distribution by sampling from another. 
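As a quick numerical reminder of this reweighting trick, here is a self-contained sketch of importance sampling over a finite set of actions. The particular distributions and function values below are arbitrary choices for illustration.

```python
import jax
import jax.numpy as jnp

p = jnp.array([0.6, 0.3, 0.1])   # distribution we care about (e.g. the new policy at some state)
q = jnp.array([0.2, 0.3, 0.5])   # distribution we can sample from (e.g. the old policy)
f = jnp.array([1.0, -2.0, 3.0])  # arbitrary function of the action (e.g. an advantage estimate)

exact = jnp.dot(p, f)  # E_{x ~ p}[f(x)]

# Sample from q, but reweight each sample by the likelihood ratio p(x) / q(x).
xs = jax.random.choice(jax.random.PRNGKey(0), 3, shape=(100_000,), p=q)
estimate = jnp.mean((p[xs] / q[xs]) * f[xs])

print(exact, estimate)  # the reweighted estimate is close to the exact expectation
```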
-We can once again use {ref}`importance_sampling` to rewrite the inner expectation: +We can once again use [](#importance_sampling) to rewrite the inner expectation: $$ \E_{a_\hi \sim \pi_{\theta}(s_\hi)} A^{\pi^{k}}(s_\hi, a_\hi) @@ -981,7 +972,7 @@ def ppo_pseudocode( total_objective = 0 for τ in sample_trajectories: for s, a, _r in τ: - total_objective += π(θ_opt)(s, a) / π(θ)(s, a) * A_hat(s, a) + λ * np.log(π(θ_opt)(s, a)) + total_objective += π(θ_opt)(s, a) / π(θ)(s, a) * A_hat(s, a) + λ * jnp.log(π(θ_opt)(s, a)) return total_objective / n_sample_trajectories θ = optimize(objective, θ) diff --git a/book/planning.md b/book/planning.md index 6b93bda..de1b99a 100644 --- a/book/planning.md +++ b/book/planning.md @@ -1,15 +1,21 @@ +--- +jupytext: + text_representation: + extension: .md + format_name: myst + format_version: 0.13 + jupytext_version: 1.16.2 +kernelspec: + display_name: Python 3 (ipykernel) + language: python + name: python3 +--- - -+++ - -+++ - -+++ - -(planning)= # Planning +## Introduction + ## Monte Carlo Tree Search (INCOMPLETE) diff --git a/book/shared/deterministic_policy.png b/book/shared/deterministic_policy.png new file mode 100644 index 0000000..ed1e5e1 Binary files /dev/null and b/book/shared/deterministic_policy.png differ diff --git a/book/shared/stochastic_policy.png b/book/shared/stochastic_policy.png new file mode 100644 index 0000000..1fff43f Binary files /dev/null and b/book/shared/stochastic_policy.png differ diff --git a/book/supervised_learning.md b/book/supervised_learning.md index e8d9485..ff8c381 100644 --- a/book/supervised_learning.md +++ b/book/supervised_learning.md @@ -11,9 +11,10 @@ kernelspec: name: python3 --- -(supervised_learning)= # Supervised learning +## Introduction + This section will cover the details of implementing the `fit` function above: That is, how to use a dataset of labelled samples $(x_1, y_1), \dots, (x_N, y_N)$ to find a function $f$ that minimizes the empirical risk. This requires two ingredients: @@ -38,7 +39,9 @@ $$ The most common fitting method for parameterized models is **gradient descent**. 
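For concreteness, here is a minimal sketch of gradient descent on a squared-error empirical risk for a linear model. The model class, loss, and function names are illustrative assumptions; the update rule itself is stated formally in the definition that follows.

```python
import jax
import jax.numpy as jnp

def empirical_risk(θ, X, y):
    """Mean squared error of the linear model f_θ(x) = x @ θ on the dataset (X, y)."""
    return jnp.mean((y - X @ θ) ** 2)

def gradient_descent(θ_init, X, y, η: float, n_steps: int):
    """Repeatedly step in the direction of steepest decrease of the empirical risk."""
    θ = θ_init
    grad_L = jax.grad(empirical_risk)  # gradient with respect to θ
    for _ in range(n_steps):
        θ = θ - η * grad_L(θ, X, y)
    return θ
```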
-:::{prf:algorithm} Gradient descent +:::{prf:definition} Gradient descent +:label: gd_def + Letting $L(\theta) \in \mathbb{R}$ denote the empirical risk in terms of the parameters, the gradient descent algorithm updates the parameters according to the rule diff --git a/book/utils.py b/book/utils.py new file mode 100644 index 0000000..303c6b1 --- /dev/null +++ b/book/utils.py @@ -0,0 +1,22 @@ +import matplotlib.pyplot as plt + +# convenient class builder +from typing import NamedTuple + +# function typings +from collections.abc import Callable + +# array typings +from jaxtyping import Float, Array + +# convenient function composition +from functools import partial + +# numerical computing and linear algebra +import jax +import jax.numpy as jnp + +# print functions as latex +import latexify + +plt.style.use("fivethirtyeight") diff --git a/environment.yml b/environment.yml index 767195a..d131e0b 100644 --- a/environment.yml +++ b/environment.yml @@ -1,26 +1,21 @@ name: rlbook channels: - conda-forge - - plotly dependencies: - # code - - python 3.11.* - - jax 0.4.* - - equinox 0.11.* - - ruff - # visualization - - bokeh - - jupyter_bokeh - - tqdm - - latexify-py - # book - - jupyter-book - - jupyterlab - - jupytext 1.16.2 - - swig - # github pages - - ghp-import - # pip + - python 3.11.* # python + - jax 0.4.* # automatic differentiation + - jaxtyping # array types + - ruff # code formatting + - matplotlib # visualization + - tqdm # progress bars + - latexify-py # python to latex + - texlive-core # latex distribution + - latexmk # latex projects + - mystmd # markdown notebooks + - jupyterlab # notebooks + - jupyterlab-myst # enable mystmd in jupyter + - jupytext 1.16.2 # open md in jupyter + - ghp-import # github pages - pip: - "gymnasium[box2d]" - sphinx-proof diff --git a/glossary.tex b/glossary.tex deleted file mode 100644 index cdacf73..0000000 --- a/glossary.tex +++ /dev/null @@ -1,25 +0,0 @@ -\usepackage[toc]{glossaries} - -\makeglossaries - -\newglossaryentry{value function}{ - name=value function, - description={The expected total reward when acting according to a given policy in a given MDP from a given state} -} - -\newglossaryentry{q function}{ - name=q function, - description={The expected total reward when acting according to a given policy in a given MDP from a given state and action} -} - -\newglossaryentry{exploration-exploitation tradeoff}{ - name=exploration-exploitation tradeoff, - description={The tradeoff between exploiting actions we know to be good and exploring new actions that could be better} -} - - - -% ==================== ACRONYMS ==================== - -\newacronym{mab}{MAB}{multi-armed bandit} -\newacronym{etc}{ETC}{explore-then-commit} diff --git a/main.bib b/main.bib deleted file mode 100644 index 1eba405..0000000 --- a/main.bib +++ /dev/null @@ -1,333 +0,0 @@ -@inreference{achiam_spinning_2018, - title = {Spinning {{Up}} in {{Deep Reinforcement Learning}}}, - author = {Achiam, Joshua}, - date = {2018}, - url = {https://spinningup.openai.com/en/latest/index.html}, - urldate = {2024-07-01}, - file = {/Users/alexandercai/Zotero/storage/UPUMW6XV/index.html} -} - -@online{adaptive_agent_team_human-timescale_2023, - title = {Human-{{Timescale Adaptation}} in an {{Open-Ended Task Space}}}, - author = {Adaptive Agent Team and Bauer, Jakob and Baumli, Kate and Baveja, Satinder and Behbahani, Feryal and Bhoopchand, Avishkar and Bradley-Schmieg, Nathalie and Chang, Michael and Clay, Natalie and Collister, Adrian and Dasagi, Vibhavari and Gonzalez, Lucy and Gregor, Karol 
and Hughes, Edward and Kashem, Sheleem and Loks-Thompson, Maria and Openshaw, Hannah and Parker-Holder, Jack and Pathak, Shreya and Perez-Nieves, Nicolas and Rakicevic, Nemanja and Rocktäschel, Tim and Schroecker, Yannick and Sygnowski, Jakub and Tuyls, Karl and York, Sarah and Zacherl, Alexander and Zhang, Lei}, - date = {2023-01-18}, - eprint = {2301.07608}, - eprinttype = {arXiv}, - eprintclass = {cs}, - url = {http://arxiv.org/abs/2301.07608}, - urldate = {2023-02-21}, - abstract = {Foundation models have shown impressive adaptation and scalability in supervised and self-supervised learning problems, but so far these successes have not fully translated to reinforcement learning (RL). In this work, we demonstrate that training an RL agent at scale leads to a general in-context learning algorithm that can adapt to open-ended novel embodied 3D problems as quickly as humans. In a vast space of held-out environment dynamics, our adaptive agent (AdA) displays on-the-fly hypothesis-driven exploration, efficient exploitation of acquired knowledge, and can successfully be prompted with first-person demonstrations. Adaptation emerges from three ingredients: (1) meta-reinforcement learning across a vast, smooth and diverse task distribution, (2) a policy parameterised as a large-scale attention-based memory architecture, and (3) an effective automated curriculum that prioritises tasks at the frontier of an agent's capabilities. We demonstrate characteristic scaling laws with respect to network size, memory length, and richness of the training task distribution. We believe our results lay the foundation for increasingly general and adaptive RL agents that perform well across ever-larger open-ended domains.}, - pubstate = {prepublished}, - keywords = {Computer Science - Artificial Intelligence,Computer Science - Machine Learning,Computer Science - Neural and Evolutionary Computing}, - annotation = {1 citations (Semantic Scholar/arXiv) [2023-02-20]}, - file = {/Users/alexandercai/Library/CloudStorage/GoogleDrive-alexcai@college.harvard.edu/My Drive/Vault/papers/assets/2023/Human-Timescale Adaptation in an Open-Ended Task Space (2023) - Adaptive Agent Team et al.pdf} -} - -@book{agarwal_reinforcement_2022, - title = {Reinforcement {{Learning}}: {{Theory}} and {{Algorithms}}}, - shorttitle = {{{AJKS}}}, - author = {Agarwal, Alekh and Jiang, Nan and Kakade, Sham M and Sun, Wen}, - date = {2022-01-31}, - url = {https://rltheorybook.github.io/rltheorybook_AJKS.pdf}, - langid = {english}, - file = {/Users/alexandercai/Library/CloudStorage/GoogleDrive-alexcai@college.harvard.edu/My Drive/Vault/papers/assets/2022/Reinforcement Learning (2022) - Agarwal, Jiang, Kakade, Sun.pdf} -} - -@inproceedings{azar_minimax_2017, - title = {Minimax {{Regret Bounds}} for {{Reinforcement Learning}}}, - booktitle = {Proceedings of the 34th {{International Conference}} on {{Machine Learning}}}, - author = {Azar, Mohammad Gheshlaghi and Osband, Ian and Munos, Rémi}, - date = {2017-07-17}, - pages = {263--272}, - publisher = {PMLR}, - issn = {2640-3498}, - url = {https://proceedings.mlr.press/v70/azar17a.html}, - urldate = {2024-06-21}, - abstract = {We consider the problem of provably optimal exploration in reinforcement learning for finite horizon MDPs. 
We show that an optimistic modification to value iteration achieves a regret bound of \$\textbackslash tilde \{O\}( \textbackslash sqrt\{HSAT\} + H\textasciicircum 2S\textasciicircum 2A+H\textbackslash sqrt\{T\})\$ where \$H\$ is the time horizon, \$S\$ the number of states, \$A\$ the number of actions and \$T\$ the number of time-steps. This result improves over the best previous known bound \$\textbackslash tilde \{O\}(HS \textbackslash sqrt\{AT\})\$ achieved by the UCRL2 algorithm. The key significance of our new results is that when \$T\textbackslash geq H\textasciicircum 3S\textasciicircum 3A\$ and \$SA\textbackslash geq H\$, it leads to a regret of \$\textbackslash tilde\{O\}(\textbackslash sqrt\{HSAT\})\$ that matches the established lower bound of \$\textbackslash Omega(\textbackslash sqrt\{HSAT\})\$ up to a logarithmic factor. Our analysis contain two key insights. We use careful application of concentration inequalities to the optimal value function as a whole, rather than to the transitions probabilities (to improve scaling in \$S\$), and we define Bernstein-based “exploration bonuses” that use the empirical variance of the estimated values at the next states (to improve scaling in \$H\$).}, - eventtitle = {International {{Conference}} on {{Machine Learning}}}, - langid = {english}, - file = {/Users/alexandercai/Library/CloudStorage/GoogleDrive-alexcai@college.harvard.edu/My Drive/Vault/papers/assets/2017/Minimax Regret Bounds for Reinforcement Learning (2017) - Azar, Osband, Munos.pdf} -} - -@software{babuschkin_deepmind_2020, - title = {The {{DeepMind JAX Ecosystem}}}, - author = {Babuschkin, Igor and Baumli, Kate and Bell, Alison and Bhupatiraju, Surya and Bruce, Jake and Buchlovsky, Peter and Budden, David and Cai, Trevor and Clark, Aidan and Danihelka, Ivo and Dedieu, Antoine and Fantacci, Claudio and Godwin, Jonathan and Jones, Chris and Hemsley, Ross and Hennigan, Tom and Hessel, Matteo and Hou, Shaobo and Kapturowski, Steven and Keck, Thomas and Kemaev, Iurii and King, Michael and Kunesch, Markus and Martens, Lena and Merzic, Hamza and Mikulik, Vladimir and Norman, Tamara and Papamakarios, George and Quan, John and Ring, Roman and Ruiz, Francisco and Sanchez, Alvaro and Schneider, Rosalia and Sezener, Eren and Spencer, Stephen and Srinivasan, Srivatsan and Stokowiec, Wojciech and Wang, Luyu and Zhou, Guangyao and Viola, Fabio}, - date = {2020}, - url = {http://github.com/deepmind} -} - -@article{barto_neuronlike_1983, - title = {Neuronlike Adaptive Elements That Can Solve Difficult Learning Control Problems}, - author = {Barto, Andrew G. and Sutton, Richard S. and Anderson, Charles W.}, - date = {1983-09}, - journaltitle = {IEEE Transactions on Systems, Man, and Cybernetics}, - volume = {SMC-13}, - number = {5}, - pages = {834--846}, - issn = {2168-2909}, - doi = {10.1109/TSMC.1983.6313077}, - url = {https://ieeexplore.ieee.org/document/6313077}, - urldate = {2024-07-01}, - abstract = {It is shown how a system consisting of two neuronlike adaptive elements can solve a difficult learning control problem. The task is to balance a pole that is hinged to a movable cart by applying forces to the cart's base. It is argued that the learning problems faced by adaptive elements that are components of adaptive networks are at least as difficult as this version of the pole-balancing problem. The learning system consists of a single associative search element (ASE) and a single adaptive critic element (ACE). 
In the course of learning to balance the pole, the ASE constructs associations between input and output by searching under the influence of reinforcement feedback, and the ACE constructs a more informative evaluation function than reinforcement feedback alone can provide. The differences between this approach and other attempts to solve problems using neurolike elements are discussed, as is the relation of this work to classical and instrumental conditioning in animal learning studies and its possible implications for research in the neurosciences.}, - eventtitle = {{{IEEE Transactions}} on {{Systems}}, {{Man}}, and {{Cybernetics}}}, - keywords = {Adaptive systems,Biological neural networks,Neurons,Pattern recognition,Problem-solving,Supervised learning,Training}, - file = {/Users/alexandercai/Zotero/storage/GHD9WZXL/6313077.html} -} - -@article{degrave_magnetic_2022, - title = {Magnetic Control of Tokamak Plasmas through Deep Reinforcement Learning}, - author = {Degrave, Jonas and Felici, Federico and Buchli, Jonas and Neunert, Michael and Tracey, Brendan and Carpanese, Francesco and Ewalds, Timo and Hafner, Roland and Abdolmaleki, Abbas and family=Casas, given=Diego, prefix=de las, useprefix=true and Donner, Craig and Fritz, Leslie and Galperti, Cristian and Huber, Andrea and Keeling, James and Tsimpoukelli, Maria and Kay, Jackie and Merle, Antoine and Moret, Jean-Marc and Noury, Seb and Pesamosca, Federico and Pfau, David and Sauter, Olivier and Sommariva, Cristian and Coda, Stefano and Duval, Basil and Fasoli, Ambrogio and Kohli, Pushmeet and Kavukcuoglu, Koray and Hassabis, Demis and Riedmiller, Martin}, - date = {2022-02}, - journaltitle = {Nature}, - volume = {602}, - number = {7897}, - pages = {414--419}, - publisher = {Nature Publishing Group}, - issn = {1476-4687}, - doi = {10.1038/s41586-021-04301-9}, - url = {https://www.nature.com/articles/s41586-021-04301-9}, - urldate = {2023-05-21}, - abstract = {Nuclear fusion using magnetic confinement, in particular in the tokamak configuration, is a promising path towards sustainable energy. A core challenge is to shape and maintain a high-temperature plasma within the tokamak vessel. This requires high-dimensional, high-frequency, closed-loop control using magnetic actuator coils, further complicated by the diverse requirements across a wide range of plasma configurations. In this work, we introduce a previously undescribed architecture for tokamak magnetic controller design that autonomously learns to command the full set of control coils. This architecture meets control objectives specified at a high level, at the same time satisfying physical and operational constraints. This approach has unprecedented flexibility and generality in problem specification and yields a notable reduction in design effort to produce new plasma configurations. We successfully produce and control a diverse set of plasma configurations on the Tokamak à Configuration Variable1,2, including elongated, conventional shapes, as well as advanced configurations, such as negative triangularity and ‘snowflake’ configurations. Our approach achieves accurate tracking of the location, current and shape for these configurations. We also demonstrate sustained ‘droplets’ on TCV, in which two separate plasmas are maintained simultaneously within the vessel. 
This represents a notable advance for tokamak feedback control, showing the potential of reinforcement learning to accelerate research in the fusion domain, and is one of the most challenging real-world systems to which reinforcement learning has been applied.}, - issue = {7897}, - langid = {english}, - keywords = {Computer science,Magnetically confined plasmas,Nuclear fusion and fission}, - annotation = {230 citations (Semantic Scholar/DOI) [2023-05-21]}, - file = {/Users/alexandercai/Library/CloudStorage/GoogleDrive-alexcai@college.harvard.edu/My Drive/Vault/papers/assets/2022/Magnetic control of tokamak plasmas through deep reinforcement learning (2022) - Degrave et al.pdf} -} - -@inproceedings{freeman_brax_2021, - title = {Brax – {{A Differentiable Physics Engine}} for {{Large Scale Rigid Body Simulation}}}, - booktitle = {{{NeurIPS Datasets}} and {{Benchmarks}} 2021}, - author = {Freeman, C. Daniel and Frey, Erik and Raichuk, Anton and Girgin, Sertan and Mordatch, Igor and Bachem, Olivier}, - date = {2021-06-24}, - eprint = {2106.13281}, - eprinttype = {arXiv}, - eprintclass = {cs}, - doi = {10.48550/arXiv.2106.13281}, - url = {https://datasets-benchmarks-proceedings.neurips.cc/paper/2021/hash/d1f491a404d6854880943e5c3cd9ca25-Abstract-round1.html}, - urldate = {2023-06-26}, - abstract = {We present Brax, an open source library for rigid body simulation with a focus on performance and parallelism on accelerators, written in JAX. We present results on a suite of tasks inspired by the existing reinforcement learning literature, but remade in our engine. Additionally, we provide reimplementations of PPO, SAC, ES, and direct policy optimization in JAX that compile alongside our environments, allowing the learning algorithm and the environment processing to occur on the same device, and to scale seamlessly on accelerators. Finally, we include notebooks that facilitate training of performant policies on common OpenAI Gym MuJoCo-like tasks in minutes.}, - eventtitle = {{{NeurIPS Datasets}} and {{Benchmarks}}}, - pubstate = {preprint | DBLP: https://dblp.org/rec/conf/nips/FreemanFRGMB21}, - keywords = {Computer Science - Artificial Intelligence,Computer Science - Robotics}, - annotation = {151 citations (Semantic Scholar/arXiv) [2023-07-22]}, - file = {/Users/alexandercai/Library/CloudStorage/GoogleDrive-alexcai@college.harvard.edu/My Drive/Vault/papers/assets/2021/Brax – A Differentiable Physics Engine for Large Scale Rigid Body Simulation (2021) - Freeman et al.pdf} -} - -@online{hausknecht_deep_2017, - title = {Deep {{Recurrent Q-Learning}} for {{Partially Observable MDPs}}}, - author = {Hausknecht, Matthew and Stone, Peter}, - date = {2017-01-11}, - eprint = {1507.06527}, - eprinttype = {arXiv}, - eprintclass = {cs}, - doi = {10.48550/arXiv.1507.06527}, - url = {http://arxiv.org/abs/1507.06527}, - urldate = {2023-06-04}, - abstract = {Deep Reinforcement Learning has yielded proficient controllers for complex tasks. However, these controllers have limited memory and rely on being able to perceive the complete game screen at each decision point. To address these shortcomings, this article investigates the effects of adding recurrency to a Deep Q-Network (DQN) by replacing the first post-convolutional fully-connected layer with a recurrent LSTM. 
The resulting \textbackslash textit\{Deep Recurrent Q-Network\} (DRQN), although capable of seeing only a single frame at each timestep, successfully integrates information through time and replicates DQN's performance on standard Atari games and partially observed equivalents featuring flickering game screens. Additionally, when trained with partial observations and evaluated with incrementally more complete observations, DRQN's performance scales as a function of observability. Conversely, when trained with full observations and evaluated with partial observations, DRQN's performance degrades less than DQN's. Thus, given the same length of history, recurrency is a viable alternative to stacking a history of frames in the DQN's input layer and while recurrency confers no systematic advantage when learning to play the game, the recurrent net can better adapt at evaluation time if the quality of observations changes.}, - pubstate = {prepublished}, - keywords = {Computer Science - Machine Learning}, - annotation = {1274 citations (Semantic Scholar/arXiv) [2023-06-04]}, - file = {/Users/alexandercai/Library/CloudStorage/GoogleDrive-alexcai@college.harvard.edu/My Drive/Vault/papers/assets/2017/Deep Recurrent Q-Learning for Partially Observable MDPs (2017) - Hausknecht, Stone.pdf} -} - -@book{kochenderfer_algorithms_2022, - title = {Algorithms for {{Decision Making}}}, - author = {Kochenderfer, Mykel J and Wheeler, Tim A and Wray, Kyle H}, - date = {2022-08-16}, - url = {https://mitpress.mit.edu/9780262047012/algorithms-for-decision-making/}, - urldate = {2022-10-23}, - abstract = {A broad introduction to algorithms for decision making under uncertainty, introducing the underlying mathematical problem formulations and the algorithms for...}, - isbn = {978-0-262-04701-2}, - langid = {american}, - file = {/Users/alexandercai/Library/CloudStorage/GoogleDrive-alexcai@college.harvard.edu/My Drive/Vault/papers/assets/2022/Algorithms for Decision Making (2022) - Kochenderfer, Wheeler, Wray.pdf} -} - -@article{lai_asymptotically_1985, - title = {Asymptotically Efficient Adaptive Allocation Rules}, - author = {Lai, T. L and Robbins, Herbert}, - date = {1985-03-01}, - journaltitle = {Advances in Applied Mathematics}, - shortjournal = {Advances in Applied Mathematics}, - volume = {6}, - number = {1}, - pages = {4--22}, - issn = {0196-8858}, - doi = {10.1016/0196-8858(85)90002-8}, - url = {https://www.sciencedirect.com/science/article/pii/0196885885900028}, - urldate = {2023-10-23}, - file = {/Users/alexandercai/Library/CloudStorage/GoogleDrive-alexcai@college.harvard.edu/My Drive/Vault/papers/assets/1985/Asymptotically efficient adaptive allocation rules (1985) - Lai, Robbins.pdf} -} - -@inproceedings{lechner_gigastep_2023, - title = {Gigastep - {{One Billion Steps}} per {{Second Multi-agent Reinforcement Learning}}}, - author = {Lechner, Mathias and Yin, Lianhao and Seyde, Tim and Wang, Tsun-Hsuan and Xiao, Wei and Hasani, Ramin and Rountree, Joshua and Rus, Daniela}, - date = {2023-11-02}, - url = {https://openreview.net/forum?id=UgPAaEugH3}, - urldate = {2023-12-12}, - abstract = {Multi-agent reinforcement learning (MARL) research is faced with a trade-off: it either uses complex environments requiring large compute resources, which makes it inaccessible to researchers with limited resources, or relies on simpler dynamics for faster execution, which makes the transferability of the results to more realistic tasks challenging. 
Motivated by these challenges, we present Gigastep, a fully vectorizable, MARL environment implemented in JAX, capable of executing up to one billion environment steps per second on consumer-grade hardware. Its design allows for comprehensive MARL experimentation, including a complex, high-dimensional space defined by 3D dynamics, stochasticity, and partial observations. Gigastep supports both collaborative and adversarial tasks, continuous and discrete action spaces, and provides RGB image and feature vector observations, allowing the evaluation of a wide range of MARL algorithms. We validate Gigastep's usability through an extensive set of experiments, underscoring its role in widening participation and promoting inclusivity in the MARL research community.}, - eventtitle = {Thirty-Seventh {{Conference}} on {{Neural Information Processing Systems Datasets}} and {{Benchmarks Track}}}, - langid = {english}, - file = {/Users/alexandercai/Library/CloudStorage/GoogleDrive-alexcai@college.harvard.edu/My Drive/Vault/papers/assets/2023/Gigastep - One Billion Steps per Second Multi-agent Reinforcement Learning (2023) - Lechner et al.pdf} -} - -@article{mnih_playing_2013, - title = {Playing {{Atari}} with {{Deep Reinforcement Learning}}}, - author = {Mnih, Volodymyr and Kavukcuoglu, Koray and Silver, David and Graves, Alex and Antonoglou, Ioannis and Wierstra, Daan and Riedmiller, Martin A.}, - date = {2013}, - journaltitle = {CoRR}, - volume = {abs/1312.5602}, - eprint = {1312.5602}, - eprinttype = {arXiv}, - url = {http://arxiv.org/abs/1312.5602}, - urldate = {2024-06-21}, - file = {/Users/alexandercai/Library/CloudStorage/GoogleDrive-alexcai@college.harvard.edu/My Drive/Vault/papers/assets/2013/Playing Atari with Deep Reinforcement Learning (2013) - Mnih et al.pdf} -} - -@book{nielsen_neural_2015, - title = {Neural {{Networks}} and {{Deep Learning}}}, - author = {Nielsen, Michael A.}, - date = {2015}, - publisher = {Determination Press}, - url = {http://neuralnetworksanddeeplearning.com/}, - urldate = {2024-03-10} -} - -@inproceedings{ross_reduction_2010, - title = {A {{Reduction}} of {{Imitation Learning}} and {{Structured Prediction}} to {{No-Regret Online Learning}}}, - author = {Ross, Stéphane and Gordon, Geoffrey J. and Bagnell, J.}, - date = {2010-11-02}, - url = {https://www.semanticscholar.org/paper/A-Reduction-of-Imitation-Learning-and-Structured-to-Ross-Gordon/79ab3c49903ec8cb339437ccf5cf998607fc313e}, - urldate = {2024-08-08}, - abstract = {Sequential prediction problems such as imitation learning, where future observations depend on previous predictions (actions), violate the common i.i.d. assumptions made in statistical learning. This leads to poor performance in theory and often in practice. Some recent approaches provide stronger guarantees in this setting, but remain somewhat unsatisfactory as they train either non-stationary or stochastic policies and require a large number of iterations. In this paper, we propose a new iterative algorithm, which trains a stationary deterministic policy, that can be seen as a no regret algorithm in an online learning setting. We show that any such no regret algorithm, combined with additional reduction assumptions, must find a policy with good performance under the distribution of observations it induces in such sequential settings. 
We demonstrate that this new approach outperforms previous approaches on two challenging imitation learning problems and a benchmark sequence labeling problem.}, - eventtitle = {International {{Conference}} on {{Artificial Intelligence}} and {{Statistics}}}, - file = {/Users/alexandercai/Library/CloudStorage/GoogleDrive-alexcai@college.harvard.edu/My Drive/Vault/papers/assets/2010/A Reduction of Imitation Learning and Structured Prediction to No-Regret Online (2010) - Ross, Gordon, Bagnell.pdf} -} - -@online{sun_easy--hard_2024, - title = {Easy-to-{{Hard Generalization}}: {{Scalable Alignment Beyond Human Supervision}}}, - shorttitle = {Easy-to-{{Hard Generalization}}}, - author = {Sun, Zhiqing and Yu, Longhui and Shen, Yikang and Liu, Weiyang and Yang, Yiming and Welleck, Sean and Gan, Chuang}, - date = {2024-03-14}, - eprint = {2403.09472}, - eprinttype = {arXiv}, - eprintclass = {cs}, - doi = {10.48550/arXiv.2403.09472}, - url = {http://arxiv.org/abs/2403.09472}, - urldate = {2024-07-01}, - abstract = {Current AI alignment methodologies rely on human-provided demonstrations or judgments, and the learned capabilities of AI systems would be upper-bounded by human capabilities as a result. This raises a challenging research question: How can we keep improving the systems when their capabilities have surpassed the levels of humans? This paper answers this question in the context of tackling hard reasoning tasks (e.g., level 4-5 MATH problems) via learning from human annotations on easier tasks (e.g., level 1-3 MATH problems), which we term as \textbackslash textit\{easy-to-hard generalization\}. Our key insight is that an evaluator (reward model) trained on supervisions for easier tasks can be effectively used for scoring candidate solutions of harder tasks and hence facilitating easy-to-hard generalization over different levels of tasks. Based on this insight, we propose a novel approach to scalable alignment, which firstly trains the process-supervised reward models on easy problems (e.g., level 1-3), and then uses them to evaluate the performance of policy models on hard problems. We show that such \textbackslash textit\{easy-to-hard generalization from evaluators\} can enable \textbackslash textit\{easy-to-hard generalizations in generators\} either through re-ranking or reinforcement learning (RL). Notably, our process-supervised 7b RL model achieves an accuracy of 34.0\textbackslash\% on MATH500, despite only using human supervision on easy problems. 
Our approach suggests a promising path toward AI systems that advance beyond the frontier of human supervision.}, - pubstate = {prepublished}, - keywords = {Computer Science - Artificial Intelligence,Computer Science - Computation and Language,Computer Science - Machine Learning}, - file = {/Users/alexandercai/Library/CloudStorage/GoogleDrive-alexcai@college.harvard.edu/My Drive/Vault/papers/assets/2024/Easy-to-Hard Generalization (2024) - Sun et al.pdf;/Users/alexandercai/Zotero/storage/J52D59AK/2403.html} -} - -@book{sussman_functional_2013, - title = {Functional Differential Geometry}, - author = {Sussman, Gerald Jay and Wisdom, Jack and Farr, Will}, - date = {2013}, - publisher = {The MIT Press}, - location = {Cambridge, MA}, - isbn = {978-0-262-01934-7}, - pagetotal = {228}, - keywords = {Functional differential equations,Geometry Differential,Mathematical physics}, - file = {/Users/alexandercai/Library/CloudStorage/GoogleDrive-alexcai@college.harvard.edu/My Drive/Vault/papers/assets/2013/Functional differential geometry (2013) - Sussman, Wisdom, Farr.pdf} -} - -@book{sutton_reinforcement_2018, - title = {Reinforcement Learning: An Introduction}, - shorttitle = {Reinforcement Learning}, - author = {Sutton, Richard S. and Barto, Andrew G.}, - date = {2018}, - series = {Adaptive Computation and Machine Learning Series}, - edition = {Second edition}, - publisher = {The MIT Press}, - location = {Cambridge, Massachusetts}, - url = {http://incompleteideas.net/book/RLbook2020trimmed.pdf}, - abstract = {"Reinforcement learning, one of the most active research areas in artificial intelligence, is a computational approach to learning whereby an agent tries to maximize the total amount of reward it receives while interacting with a complex, uncertain environment. In Reinforcement Learning, Richard Sutton and Andrew Barto provide a clear and simple account of the field's key ideas and algorithms."--}, - isbn = {978-0-262-03924-6}, - langid = {english}, - pagetotal = {526}, - keywords = {Reinforcement learning}, - file = {/Users/alexandercai/Library/CloudStorage/GoogleDrive-alexcai@college.harvard.edu/My Drive/Vault/papers/assets/2018/Reinforcement learning (2018) - Sutton, Barto.pdf} -} - -@book{vershynin_high-dimensional_2018, - title = {High-{{Dimensional Probability}}: {{An Introduction}} with {{Applications}} in {{Data Science}}}, - shorttitle = {High-{{Dimensional Probability}}}, - author = {Vershynin, Roman}, - date = {2018-09-27}, - eprint = {NDdqDwAAQBAJ}, - eprinttype = {googlebooks}, - publisher = {Cambridge University Press}, - abstract = {High-dimensional probability offers insight into the behavior of random vectors, random matrices, random subspaces, and objects used to quantify uncertainty in high dimensions. Drawing on ideas from probability, analysis, and geometry, it lends itself to applications in mathematics, statistics, theoretical computer science, signal processing, optimization, and more. It is the first to integrate theory, key tools, and modern applications of high-dimensional probability. Concentration inequalities form the core, and it covers both classical results such as Hoeffding's and Chernoff's inequalities and modern developments such as the matrix Bernstein's inequality. It then introduces the powerful methods based on stochastic processes, including such tools as Slepian's, Sudakov's, and Dudley's inequalities, as well as generic chaining and bounds based on VC dimension. 
A broad range of illustrations is embedded throughout, including classical and modern results for covariance estimation, clustering, networks, semidefinite programming, coding, dimension reduction, matrix completion, machine learning, compressed sensing, and sparse regression.}, - isbn = {978-1-108-41519-4}, - langid = {english}, - pagetotal = {299}, - keywords = {Business & Economics / Econometrics,Computers / Optical Data Processing,Language Arts & Disciplines / Library & Information Science / General,Mathematics / Probability & Statistics / General,Technology & Engineering / Signals & Signal Processing}, - file = {/Users/alexandercai/Library/CloudStorage/GoogleDrive-alexcai@college.harvard.edu/My Drive/Vault/papers/assets/2018/High-Dimensional Probability (2018) - Vershynin.pdf} -} - -@online{welleck_decoding_2024, - title = {From {{Decoding}} to {{Meta-Generation}}: {{Inference-time Algorithms}} for {{Large Language Models}}}, - shorttitle = {From {{Decoding}} to {{Meta-Generation}}}, - author = {Welleck, Sean and Bertsch, Amanda and Finlayson, Matthew and Schoelkopf, Hailey and Xie, Alex and Neubig, Graham and Kulikov, Ilia and Harchaoui, Zaid}, - date = {2024-06-24}, - eprint = {2406.16838}, - eprinttype = {arXiv}, - eprintclass = {cs}, - doi = {10.48550/arXiv.2406.16838}, - url = {http://arxiv.org/abs/2406.16838}, - urldate = {2024-07-01}, - abstract = {One of the most striking findings in modern research on large language models (LLMs) is that scaling up compute during training leads to better results. However, less attention has been given to the benefits of scaling compute during inference. This survey focuses on these inference-time approaches. We explore three areas under a unified mathematical formalism: token-level generation algorithms, meta-generation algorithms, and efficient generation. Token-level generation algorithms, often called decoding algorithms, operate by sampling a single token at a time or constructing a token-level search space and then selecting an output. These methods typically assume access to a language model's logits, next-token distributions, or probability scores. Meta-generation algorithms work on partial or full sequences, incorporating domain knowledge, enabling backtracking, and integrating external information. Efficient generation methods aim to reduce token costs and improve the speed of generation. 
Our survey unifies perspectives from three research communities: traditional natural language processing, modern LLMs, and machine learning systems.}, - pubstate = {prepublished}, - keywords = {Computer Science - Computation and Language,Computer Science - Machine Learning}, - file = {/Users/alexandercai/Library/CloudStorage/GoogleDrive-alexcai@college.harvard.edu/My Drive/Vault/papers/assets/2024/From Decoding to Meta-Generation (2024) - Welleck et al.pdf;/Users/alexandercai/Zotero/storage/S4Y984R4/2406.html} -} - -@online{zhai_fine-tuning_2024, - title = {Fine-{{Tuning Large Vision-Language Models}} as {{Decision-Making Agents}} via {{Reinforcement Learning}}}, - author = {Zhai, Yuexiang and Bai, Hao and Lin, Zipeng and Pan, Jiayi and Tong, Shengbang and Zhou, Yifei and Suhr, Alane and Xie, Saining and LeCun, Yann and Ma, Yi and Levine, Sergey}, - date = {2024-05-16}, - eprint = {2405.10292}, - eprinttype = {arXiv}, - eprintclass = {cs}, - doi = {10.48550/arXiv.2405.10292}, - url = {http://arxiv.org/abs/2405.10292}, - urldate = {2024-07-01}, - abstract = {Large vision-language models (VLMs) fine-tuned on specialized visual instruction-following data have exhibited impressive language reasoning capabilities across various scenarios. However, this fine-tuning paradigm may not be able to efficiently learn optimal decision-making agents in multi-step goal-directed tasks from interactive environments. To address this challenge, we propose an algorithmic framework that fine-tunes VLMs with reinforcement learning (RL). Specifically, our framework provides a task description and then prompts the VLM to generate chain-of-thought (CoT) reasoning, enabling the VLM to efficiently explore intermediate reasoning steps that lead to the final text-based action. Next, the open-ended text output is parsed into an executable action to interact with the environment to obtain goal-directed task rewards. Finally, our framework uses these task rewards to fine-tune the entire VLM with RL. Empirically, we demonstrate that our proposed framework enhances the decision-making capabilities of VLM agents across various tasks, enabling 7b models to outperform commercial models such as GPT4-V or Gemini. 
Furthermore, we find that CoT reasoning is a crucial component for performance improvement, as removing the CoT reasoning results in a significant decrease in the overall performance of our method.}, - pubstate = {prepublished}, - keywords = {Computer Science - Artificial Intelligence,Computer Science - Computation and Language,Computer Science - Computer Vision and Pattern Recognition,Computer Science - Machine Learning}, - file = {/Users/alexandercai/Library/CloudStorage/GoogleDrive-alexcai@college.harvard.edu/My Drive/Vault/papers/assets/2024/Fine-Tuning Large Vision-Language Models as Decision-Making Agents via (2024) - Zhai et al.pdf;/Users/alexandercai/Zotero/storage/2X2WJU4D/2405.html} -} - -@online{zhang_adaptable_2024, - title = {Adaptable {{Logical Control}} for {{Large Language Models}}}, - author = {Zhang, Honghua and Kung, Po-Nien and Yoshida, Masahiro and family=Broeck, given=Guy Van, prefix=den, useprefix=false and Peng, Nanyun}, - date = {2024-06-19}, - eprint = {2406.13892}, - eprinttype = {arXiv}, - eprintclass = {cs}, - doi = {10.48550/arXiv.2406.13892}, - url = {http://arxiv.org/abs/2406.13892}, - urldate = {2024-07-01}, - abstract = {Despite the success of Large Language Models (LLMs) on various tasks following human instructions, controlling model generation at inference time poses a persistent challenge. In this paper, we introduce Ctrl-G, an adaptable framework that facilitates tractable and flexible control of LLM generation to reliably follow logical constraints. Ctrl-G combines any production-ready LLM with a Hidden Markov Model, enabling LLM outputs to adhere to logical constraints represented as deterministic finite automata. We show that Ctrl-G, when applied to a TULU2-7B model, outperforms GPT3.5 and GPT4 on the task of interactive text editing: specifically, for the task of generating text insertions/continuations following logical constraints, Ctrl-G achieves over 30\% higher satisfaction rate in human evaluation compared to GPT4. When applied to medium-size language models (e.g., GPT2-large), Ctrl-G also beats its counterparts for constrained generation by large margins on standard benchmarks. Additionally, as a proof-of-concept study, we experiment Ctrl-G on the Grade School Math benchmark to assist LLM reasoning, foreshadowing the application of Ctrl-G, as well as other constrained generation approaches, beyond traditional language generation tasks.}, - pubstate = {prepublished}, - keywords = {Computer Science - Computation and Language}, - file = {/Users/alexandercai/Library/CloudStorage/GoogleDrive-alexcai@college.harvard.edu/My Drive/Vault/papers/assets/2024/Adaptable Logical Control for Large Language Models (2024) - Zhang, Kung, Yoshida, Broeck, Peng.pdf;/Users/alexandercai/Zotero/storage/38W8T74Y/2406.html} -} - -@online{zhang_deep_2015, - title = {Deep Learning with {{Elastic Averaging SGD}}}, - author = {Zhang, Sixin and Choromanska, Anna and LeCun, Yann}, - date = {2015-10-25}, - eprint = {1412.6651}, - eprinttype = {arXiv}, - eprintclass = {cs, stat}, - doi = {10.48550/arXiv.1412.6651}, - url = {http://arxiv.org/abs/1412.6651}, - urldate = {2024-07-01}, - abstract = {We study the problem of stochastic optimization for deep learning in the parallel computing environment under communication constraints. 
A new algorithm is proposed in this setting where the communication and coordination of work among concurrent processes (local workers), is based on an elastic force which links the parameters they compute with a center variable stored by the parameter server (master). The algorithm enables the local workers to perform more exploration, i.e. the algorithm allows the local variables to fluctuate further from the center variable by reducing the amount of communication between local workers and the master. We empirically demonstrate that in the deep learning setting, due to the existence of many local optima, allowing more exploration can lead to the improved performance. We propose synchronous and asynchronous variants of the new algorithm. We provide the stability analysis of the asynchronous variant in the round-robin scheme and compare it with the more common parallelized method ADMM. We show that the stability of EASGD is guaranteed when a simple stability condition is satisfied, which is not the case for ADMM. We additionally propose the momentum-based version of our algorithm that can be applied in both synchronous and asynchronous settings. Asynchronous variant of the algorithm is applied to train convolutional neural networks for image classification on the CIFAR and ImageNet datasets. Experiments demonstrate that the new algorithm accelerates the training of deep architectures compared to DOWNPOUR and other common baseline approaches and furthermore is very communication efficient.}, - pubstate = {prepublished}, - keywords = {Computer Science - Machine Learning,Statistics - Machine Learning}, - file = {/Users/alexandercai/Library/CloudStorage/GoogleDrive-alexcai@college.harvard.edu/My Drive/Vault/papers/assets/2015/Deep learning with Elastic Averaging SGD (2015) - Zhang, Choromanska, LeCun.pdf;/Users/alexandercai/Zotero/storage/M4LFKVWK/1412.html} -} diff --git a/main.pdf b/main.pdf deleted file mode 100644 index d3fcf60..0000000 Binary files a/main.pdf and /dev/null differ diff --git a/main.tex b/main.tex deleted file mode 100644 index ccd2024..0000000 --- a/main.tex +++ /dev/null @@ -1,62 +0,0 @@ -\providecommand{\main}{.} % see https://tex.stackexchange.com/questions/289450/path-of-figures-in-different-directories-with-subfile-latex - -\documentclass[12pt,twoside,letterpaper,openany]{book} - -\input{preamble} -\input{glossary} - -\graphicspath{ {\main/assets/} } - -\setcounter{tocdepth}{4} - -\usepackage{subfiles} - -\begin{document} - -\frontmatter - -\begin{titlepage} - - \begingroup\Huge - \noindent - CS/Stat 184 - - \noindent - Introduction to Reinforcement Learning - \endgroup - - \vspace{\stretch{1}} - - \noindent - Initially created by \textbf{Alexander D. Cai} during the first iteration of - the course in Fall 2022. 
- - \tableofcontents - -\end{titlepage} - -\subfile{\main/0_frontmatter/frontmatter.tex} - -\mainmatter - -\subfile{\main/1_bandits/bandits} - -\subfile{\main/2_mdps/mdps} - -\subfile{\main/3_control/control} - -\subfile{\main/4_pg/pg} - -\subfile{\main/5_fitted_dp/fitted_dp} - -\subfile{\main/6_exploration_mdps/exploration} - -\backmatter - -\subfile{\main/appendix/appendix} - -\printglossaries - -\printbibliography - -\end{document} diff --git a/preamble.tex b/preamble.tex deleted file mode 100644 index 60181aa..0000000 --- a/preamble.tex +++ /dev/null @@ -1,125 +0,0 @@ -\usepackage[english]{babel} -\usepackage[utf8]{inputenc} -\usepackage[margin=1in, twoside]{geometry} -\usepackage[indent]{parskip} -\usepackage{graphicx, subcaption, sidenotes, enumitem, tabularx, todonotes, listings, biblatex, tikz, algpseudocode, csquotes} - - -% ==================== FORMATTING ==================== - -\renewcommand{\familydefault}{\sfdefault} -\raggedbottom % prevent stretching on mostly blank pages - -\addbibresource{\main/main.bib} - - -% ==================== HEADER STYLES ==================== - -\usepackage{fancyhdr, titlesec} - -\setlength{\headheight}{16pt} -\pagestyle{fancy} - -\fancyhf{} % clear all existing fields -\fancyhead[L]{\leftmark} -\fancyhead[R]{\rightmark} -\fancyfoot[RO, LE]{\thepage} - -\titleformat - {\section} - {\vspace{2em}\titlerule[2pt]\bfseries\Large} - {\thesection} - {2em} - {} - - -% ==================== MATH NOTATION ==================== - -% use tcolorbox theorems instead of amsthm -\usepackage{amsmath, amsfonts, amssymb} - -\let\P\relax -\DeclareMathOperator*{\P}{\mathbb{P}} -\DeclareMathOperator*{\E}{\mathbb{E}} -\DeclareMathOperator*{\argmin}{\textrm{arg\ min}} -\DeclareMathOperator*{\argmax}{\textrm{arg\ max}} -\DeclareMathOperator*{\var}{\textrm{Var}} -\DeclareMathOperator*{\tr}{\textrm{Tr}} -\DeclareMathOperator*{\diag}{\textrm{diag}} - -% semantic commands - -\newcommand{\R}{\mathbb{R}} -\newcommand{\bop}{\mathcal{J}} -\newcommand{\cN}{\mathcal{N}} -\renewcommand{\S}{\mathcal{S}} -\newcommand{\A}{\mathcal{A}} -\newcommand{\qed}{$\blacksquare$} -\newcommand{\kl}[2]{\mathrm{KL}\left(#1 \parallel #2\right)} -\newcommand{\ind}[1]{\mathbf{1}\left\{#1\right\}} -\newcommand{\hor}{H} -\newcommand{\lgr}{\mathcal{L}} -\newcommand{\hi}{h} -\newcommand{\st}{s} -\newcommand{\act}{a} -\newcommand{\Vopt}{V^\star} -\newcommand{\Qopt}{Q^\star} -\newcommand{\Nex}{N_{\text{explore}}} -\newcommand{\muv}{\boldsymbol{\mu}} -\renewcommand{\tilde}{\widetilde} -\renewcommand{\hat}{\widehat} - -\allowdisplaybreaks - - -% ==================== TCOLORBOX CONFIGURATION ==================== - -\usepackage[many]{tcolorbox} - -\newcommand{\customtheorem}[4]{ - \newcounter{#1}[section] - \expandafter\newcommand\csname #1name\endcsname{#2} - \expandafter\renewcommand\csname the#1\endcsname{\thesection.\arabic{#1}} - \newtcbtheorem[use counter*=#1] - {#1} - {#2} - { - fonttitle=\bfseries, - colback=#4, - coltitle=black, - enhanced, - breakable, - parbox=false, - boxed title style={colback=#4}, - attach boxed title to top text left={ - yshift=-\tcboxedtitleheight/2, - yshifttext=-\tcboxedtitleheight/2} - }{#3} -} - -\customtheorem{example}{Example}{eg}{violet!5} % examples -\customtheorem{derivation}{Derivation}{dr}{white} % long calculations -\customtheorem{definition}{Definition}{df}{blue!5} % important new terms and algorithms -\customtheorem{theorem}{Theorem}{th}{green!5} % theorems - -\newenvironment{exercise}{\textbf{Exercise:}}{} -\newenvironment{remark}{\textbf{Remark:}}{} - - -% 
==================== OTHER COMMANDS ==================== - -\newcommand{\rltable}[3]{\begin{center} - \begin{tabular}{|c|c|c|c|} - \hline - \textbf{States} & \textbf{Actions} &\textbf{Rewards} \\ - \hline - #1 & #2 & #3 \\ - \hline - \end{tabular} -\end{center}} - -\newenvironment{steps} - {\begingroup \samepage \begin{enumerate}[label={\bfseries Step \arabic{*}.}]} - {\end{enumerate} \endgroup} - -\usepackage{hyperref} % must come after titlesec \ No newline at end of file
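The `gd_def` hunk near the top of this patch introduces gradient descent, but the hunk ends before the update rule itself. For reference, here is a minimal sketch of the standard update $\theta \gets \theta - \eta \nabla L(\theta)$, written with the JAX imports that the new `book/utils.py` centralizes; the dataset, learning rate `eta`, and iteration count below are illustrative assumptions, not values taken from the book.

```python
import jax
import jax.numpy as jnp

# Illustrative least-squares empirical risk on a tiny fixed dataset.
# (The data, learning rate, and iteration count are assumptions for demonstration.)
X = jnp.array([[1.0, 0.0], [0.0, 1.0], [1.0, 1.0]])
y = jnp.array([1.0, 2.0, 3.0])


def L(theta):
    """Empirical risk L(theta): a scalar function of the parameters."""
    return jnp.mean((X @ theta - y) ** 2)


@jax.jit
def gd_step(theta, eta=0.1):
    """One gradient descent update: theta <- theta - eta * grad L(theta)."""
    return theta - eta * jax.grad(L)(theta)


theta = jnp.zeros(2)
for _ in range(200):
    theta = gd_step(theta)
print(theta, L(theta))  # approaches the least-squares solution [1., 2.]
```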
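Similarly, the deleted `glossary.tex` describes the value function and Q-function only in words. In the standard finite-horizon notation that the deleted `preamble.tex` sets up (`\hor` for $H$, `\E` for expectation), those verbal descriptions correspond to equations of the following form; the specific symbols ($r$, $s_h$, $a_h$) are assumed conventions rather than text from the book.

```latex
% Value function: expected total reward when following policy \pi from state s.
V^\pi(s) = \mathbb{E}\left[\sum_{h=0}^{H-1} r(s_h, a_h) \,\middle|\, s_0 = s,\ a_h \sim \pi(s_h)\right]

% Q-function: the same quantity, but with the first action fixed to a.
Q^\pi(s, a) = \mathbb{E}\left[\sum_{h=0}^{H-1} r(s_h, a_h) \,\middle|\, s_0 = s,\ a_0 = a,\ a_h \sim \pi(s_h) \text{ for } h \geq 1\right]
```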