From c0f946acd45fedfa678b785a66a4a030f47c3867 Mon Sep 17 00:00:00 2001 From: Alexander Cai Date: Fri, 6 Sep 2024 12:51:46 -0400 Subject: [PATCH] general refactoring --- Makefile | 16 +- book/_toc.yml | 13 +- book/appendix.md | 6 - book/background.md | 37 ++++ book/bandits.md | 247 ++++++--------------------- book/challenges.md | 76 --------- book/contextual_bandits.md | 168 ++++++++++++++++++ book/exploration.md | 3 +- book/fitted_dp.md | 104 +----------- book/imitation_learning.md | 1 + book/index.md | 176 ++++++++++++++++++- book/intro.md | 153 ----------------- book/mdps.md | 307 +++++++++++++++++---------------- book/pg.md | 149 +++++++++++++--- book/planning.md | 15 ++ book/shared/npg_line.png | Bin 0 -> 32010 bytes book/shared/references.bib | 329 ++++++++++++++++++++---------------- book/shared/trajectory.png | Bin 0 -> 36453 bytes book/supervised_learning.md | 120 +++++++++++++ environment.yml | 2 +- graphs.md | 23 +++ 21 files changed, 1074 insertions(+), 871 deletions(-) delete mode 100644 book/appendix.md create mode 100644 book/background.md delete mode 100644 book/challenges.md create mode 100644 book/contextual_bandits.md delete mode 100644 book/intro.md create mode 100644 book/planning.md create mode 100644 book/shared/npg_line.png create mode 100644 book/shared/trajectory.png create mode 100644 book/supervised_learning.md create mode 100644 graphs.md diff --git a/Makefile b/Makefile index 3b20ae7..c2664a7 100644 --- a/Makefile +++ b/Makefile @@ -2,18 +2,17 @@ ENV_NAME = rlbook RUN = micromamba run -n $(ENV_NAME) -_NOTEBOOKS = $(addprefix book/, intro bandits mdps fitted_dp control pg exploration) - -NOTEBOOKS = $(addsuffix .md, $(_NOTEBOOKS)) - -IPYNBS = $(addsuffix .ipynb, $(_NOTEBOOKS)) +_NOTEBOOKS = $(addprefix book/, bandits contextual_bandits control exploration fitted_dp imitation_learning mdps pg planning supervised_learning) _META = \ - appendix \ + background \ bibliography \ - challenges \ index +NOTEBOOKS = $(addsuffix .md, $(_NOTEBOOKS)) + +IPYNBS = $(addsuffix .ipynb, $(_NOTEBOOKS)) + META = $(addsuffix .md, $(addprefix book/, $(_META))) SOLUTIONS = book/solutions/bandits.py @@ -50,3 +49,6 @@ lab: lint: $(RUN) ruff check --fix $(IPYNBS) + +publish: book/_build/html + $(RUN) ghp-import --cname "rlbook.adzc.ai" --no-jekyll --push --force book/_build/html diff --git a/book/_toc.yml b/book/_toc.yml index 8ecb5f0..018f641 100644 --- a/book/_toc.yml +++ b/book/_toc.yml @@ -6,14 +6,15 @@ root: index.md options: numbered: true chapters: - - file: intro.md - - file: bandits.md - file: mdps.md - - file: fitted_dp.md - file: control.md + - file: bandits.md + - file: supervised_learning.md + - file: fitted_dp.md - file: pg.md - - file: exploration.md - file: imitation_learning.md -# - file: challenges -# - file: appendix + - file: planning.md + - file: exploration.md + - file: contextual_bandits.md - file: bibliography.md + - file: background.md diff --git a/book/appendix.md b/book/appendix.md deleted file mode 100644 index d65e599..0000000 --- a/book/appendix.md +++ /dev/null @@ -1,6 +0,0 @@ -# Derivations - -## Natural policy gradient - -The TRPO objective is -$$\max_\theta \E_{s_0, \dots, s_{H-1} \sim \rho_{\theta^k}}$$ diff --git a/book/background.md b/book/background.md new file mode 100644 index 0000000..5b691d1 --- /dev/null +++ b/book/background.md @@ -0,0 +1,37 @@ +(background)= +# Appendix: Background + +## O notation + +Throughout this chapter and the rest of the book, we will describe the +asymptotic behavior of a function using $O$ notation. 
+ +For two functions $f(t)$ and $g(t)$, we say that $f(t) \le O(g(t))$ if +$f$ is asymptotically upper bounded by $g$. Formally, this means that +there exists some constant $C > 0$ such that $f(t) \le C \cdot g(t)$ for +all $t$ past some point $t_0$. + +We say $f(t) < o(g(t))$ if asymptotically $f$ grows strictly slower than +$g$. Formally, this means that for *any* scalar $C > 0$, there exists +some $t_0$ such that $f(t) \le C \cdot g(t)$ for all $t > t_0$. +Equivalently, we say $f(t) < o(g(t))$ if +$\lim_{t \to \infty} f(t)/g(t) = 0$. + +$f(t) = \Theta(g(t))$ means that $f$ and $g$ grow at the same rate +asymptotically. That is, $f(t) \le O(g(t))$ and $g(t) \le O(f(t))$. + +Finally, we use $f(t) \ge \Omega(g(t))$ to mean that $g(t) \le O(f(t))$, +and $f(t) > \omega(g(t))$ to mean that $g(t) < o(f(t))$. + +We also use the notation $\tilde O(g(t))$ to hide logarithmic factors. +That is, $f(t) = \tilde O(g(t))$ if there exists some constant $C$ such +that $f(t) \le C \cdot g(t) \cdot \log^k(t)$ for some $k$ and all $t$. + +Occasionally, we will also use $O(f(t))$ (or one of the other symbols) +as shorthand to manipulate function classes. For example, we might write +$O(f(t)) + O(g(t)) = O(f(t) + g(t))$ to mean that the sum of two +functions in $O(f(t))$ and $O(g(t))$ is in $O(f(t) + g(t))$. + +## Python + + diff --git a/book/bandits.md b/book/bandits.md index 77910cb..331bf0e 100644 --- a/book/bandits.md +++ b/book/bandits.md @@ -14,7 +14,7 @@ kernelspec: (bandits)= # Multi-Armed Bandits -```{code-cell} +```{code-cell} ipython3 :tags: [hide-input] from jaxtyping import Float, Array @@ -22,7 +22,6 @@ import numpy as np # from bokeh.plotting import figure, show, output_notebook import latexify -from abc import ABC, abstractmethod # "Abstract Base Class" from typing import Callable, Union import matplotlib.pyplot as plt @@ -36,10 +35,12 @@ plt.style.use("fivethirtyeight") def random_argmax(ary: Array) -> int: + """Take an argmax and randomize between ties.""" max_idx = np.flatnonzero(ary == ary.max()) return np.random.choice(max_idx).item() +# used as decorator latex = latexify.algorithmic( prefixes={"mab"}, identifiers={"arm": "a_t", "reward": "r", "means": "mu"}, @@ -82,7 +83,7 @@ The name “multi-armed bandits” comes from slot machines in casinos, which ar Let $K$ denote the number of arms. We’ll label them $0, \dots, K-1$ and use *superscripts* to indicate the arm index; since we seldom need to raise a number to a power, this won’t cause much confusion. In this chapter, we’ll consider the **Bernoulli bandit** setting from the examples above, where arm $k$ either returns reward $1$ with probability $\mu^k$ or $0$ otherwise. The agent gets to pull an arm $T$ times in total. We can formalize the Bernoulli bandit in the following Python code: -```{code-cell} +```{code-cell} ipython3 class MAB: """ The Bernoulli multi-armed bandit environment. @@ -104,14 +105,14 @@ class MAB: return +reward ``` -```{code-cell} +```{code-cell} ipython3 mab = MAB(means=np.array([0.1, 0.8, 0.4]), T=100) ``` In pseudocode, the agent’s interaction with the MAB environment can be described by the following process: -```{code-cell} +```{code-cell} ipython3 @latex def mab_loop(mab: MAB, agent: "Agent") -> int: for t in range(mab.T): @@ -125,8 +126,8 @@ mab_loop The `Agent` class stores the pull history and uses it to decide which arm to pull next. Since we are working with Bernoulli bandits, we can summarize the pull history concisely in a $\mathbb{N}^{K \times 2}$ array. 
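To make this concrete, here is a small sketch of how the counts in such an array can be used to recover the number of pulls and the empirical mean reward of each arm. The column convention used here (column $0$ counts zero-reward pulls, column $1$ counts one-reward pulls) is an assumption made for illustration only.

```{code-cell} ipython3
import numpy as np

# A sketch of the (K, 2) pull history for K = 3 arms.
# Assumed convention: history[k, r] counts how many times arm k returned reward r.
history = np.zeros((3, 2), dtype=int)
history[1, 1] += 3  # arm 1 returned reward 1 three times
history[1, 0] += 1  # arm 1 returned reward 0 once

pulls = history.sum(axis=1)                   # number of pulls of each arm
means = history[:, 1] / np.maximum(pulls, 1)  # empirical mean reward (0 if never pulled)
pulls, means
```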
-```{code-cell} -class Agent(ABC): +```{code-cell} ipython3 +class Agent: def __init__(self, K: int, T: int): """The MAB agent that decides how to choose an arm given the past history.""" self.K = K @@ -135,14 +136,12 @@ class Agent(ABC): self.choices = [] self.history = np.zeros((K, 2), dtype=int) - @abstractmethod def choose_arm(self) -> int: """Choose an arm of the MAB. Algorithm-specific.""" ... - @property def count(self) -> int: - """The number of pulls made.""" + """The number of pulls made. Also the current step index.""" return len(self.rewards) def update_history(self, arm: int, reward: int): @@ -170,7 +169,7 @@ $$ $$ :::: -```{code-cell} +```{code-cell} ipython3 def regret_per_step(mab: MAB, agent: Agent): """Get the difference from the average reward of the optimal arm. The sum of these is the regret.""" return [mab.means[mab.best_arm] - mab.means[arm] for arm in agent.choices] @@ -198,7 +197,7 @@ We’d like to achieve **sublinear regret** in expectation, i.e. $\E[\text{Regre The rest of the chapter comprises a series of increasingly sophisticated MAB algorithms. -```{code-cell} +```{code-cell} ipython3 :tags: [hide-input] def plot_strategy(mab: MAB, agent: Agent): @@ -226,7 +225,7 @@ def plot_strategy(mab: MAB, agent: Agent): A trivial strategy is to always choose arms at random (i.e. "pure exploration"). -```{code-cell} +```{code-cell} ipython3 :label: pure_exploration class PureExploration(Agent): @@ -252,7 +251,7 @@ $$ This scales as $\Theta(T)$, i.e. *linear* in the number of timesteps $T$. There’s no learning here: the agent doesn’t use any information about the environment to improve its strategy. You can see that the distribution over its arm choices always appears "(uniformly) random". -```{code-cell} +```{code-cell} ipython3 agent = PureExploration(mab.K, mab.T) mab_loop(mab, agent) plot_strategy(mab, agent) @@ -264,7 +263,7 @@ How might we improve on pure exploration? Instead, we could try each arm once, and then commit to the one with the highest observed reward. We’ll call this the **pure greedy** strategy. -```{code-cell} +```{code-cell} ipython3 :label: pure_greedy class PureGreedy(Agent): @@ -297,7 +296,7 @@ $$ Which is still $\Theta(T)$, the same as pure exploration! -```{code-cell} +```{code-cell} ipython3 agent = PureGreedy(mab.K, mab.T) mab_loop(mab, agent) plot_strategy(mab, agent) @@ -313,7 +312,7 @@ The cumulative regret is a straight line because the regret only depends on the We can improve the pure greedy algorithm as follows: let’s reduce the variance of the reward estimates by pulling each arm $N_{\text{explore}}> 1$ times before committing. This is called the **explore-then-commit** strategy. Note that the “pure greedy” strategy above is just the special case where $N_{\text{explore}}= 1$. -```{code-cell} +```{code-cell} ipython3 class ExploreThenCommit(Agent): def __init__(self, K: int, T: int, N_explore: int): super().__init__(K, T) @@ -323,7 +322,7 @@ class ExploreThenCommit(Agent): return solutions.etc_choose_arm(self) ``` -```{code-cell} +```{code-cell} ipython3 agent = ExploreThenCommit(mab.K, mab.T, mab.T // 15) mab_loop(mab, agent) plot_strategy(mab, agent) @@ -466,18 +465,23 @@ beforehand – we can instead interleave exploration and exploitation by, at each timestep, choosing a random action with some probability. We call this the **epsilon-greedy** algorithm. 
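Written out, the strategy is the following, where $\hat \mu^k_t$ denotes the sample mean of the rewards observed from arm $k$ up to time $t$, $\epsilon_t$ is the exploration probability at time $t$, and ties in the maximum may be broken arbitrarily:

$$
a_t = \begin{cases}
\text{an arm drawn uniformly at random} & \text{with probability } \epsilon_t, \\
\argmax_{k} \hat \mu^k_t & \text{with probability } 1 - \epsilon_t.
\end{cases}
$$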
-```{code-cell} +```{code-cell} ipython3 class EpsilonGreedy(Agent): - def __init__(self, K: int, T: int, get_epsilon: Callable[[int], float]): + def __init__( + self, + K: int, + T: int, + ε_array: Float[Array, " T"], + ): super().__init__(K, T) - self.get_epsilon = get_epsilon + self.ε_array = ε_array def choose_arm(self): return solutions.epsilon_greedy_choose_arm(self) ``` -```{code-cell} -agent = EpsilonGreedy(mab.K, mab.T, lambda t: 0.1) +```{code-cell} ipython3 +agent = EpsilonGreedy(mab.K, mab.T, np.full(mab.T, 0.1)) mab_loop(mab, agent) plot_strategy(mab, agent) ``` @@ -566,9 +570,14 @@ This bound would then suffice for applying the UCB algorithm! That is, the upper $$M^k_t := \hat \mu^k_t + \sqrt{\frac{\ln(2t/\delta')}{2N^k_t}},$$ -where we can choose $\delta'$ depending on how tight we want the interval to be. A smaller $\delta'$ would give us a larger and higher-confidence interval, and vice versa. We can now use this to define the UCB algorithm. +where we can choose $\delta'$ depending on how tight we want the interval to be. + +- A smaller $\delta'$ would give us a larger and higher-confidence interval, emphasizing the exploration term. +- A larger $\delta'$ would give a tighter and lower-confidence interval, prioritizing the current sample averages. + +We can now use this to define the UCB algorithm. -```{code-cell} +```{code-cell} ipython3 class UCB(Agent): def __init__(self, K: int, T: int, delta: float): super().__init__(K, T) @@ -589,8 +598,8 @@ Intuitively, UCB prioritizes arms where: As desired, this explores in a smarter, *adaptive* way compared to the previous algorithms. Does it achieve lower regret? -```{code-cell} -agent = UCB(mab.K, mab.T, 0.05) +```{code-cell} ipython3 +agent = UCB(mab.K, mab.T, 0.9) mab_loop(mab, agent) plot_strategy(mab, agent) ``` @@ -650,8 +659,8 @@ $$ \end{aligned} $$ -In fact, we can do a more sophisticated analysis to trim off a factor of -$\sqrt{K}$ and show $\text{Regret}_T = \tilde O(\sqrt{TK})$. +In fact, we can do a more sophisticated analysis to trim off a factor of $\sqrt{K}$ +and show $\text{Regret}_T = \tilde O(\sqrt{TK})$. +++ @@ -690,16 +699,18 @@ From this Bayesian perspective, the **Thompson sampling** algorithm follows naturally: just sample from the distribution of the optimal arm, given the observations! -```{code-cell} -class Distribution(ABC): - @abstractmethod - def sample(self) -> Float[Array, " K"]: ... +```{code-cell} ipython3 +class Distribution: + def sample(self) -> Float[Array, " K"]: + """Sample a vector of means for the K arms.""" + ... - @abstractmethod - def update(self, arm: int, reward: float): ... + def update(self, arm: int, reward: float): + """Condition on obtaining `reward` from the given arm.""" + ... ``` -```{code-cell} +```{code-cell} ipython3 class ThompsonSampling(Agent): def __init__(self, K: int, T: int, prior: Distribution): super().__init__(K, T) @@ -753,7 +764,7 @@ distribution upon observing a reward, rather than having to recompute the entire posterior distribution from scratch. ::: -```{code-cell} +```{code-cell} ipython3 class Beta(Distribution): def __init__(self, K: int, alpha: int = 1, beta: int = 1): self.alphas = np.full(K, alpha) @@ -767,7 +778,7 @@ class Beta(Distribution): self.betas[arm] += 1 - reward ``` -```{code-cell} +```{code-cell} ipython3 beta_distribution = Beta(mab.K) agent = ThompsonSampling(mab.K, mab.T, beta_distribution) mab_loop(mab, agent) @@ -792,161 +803,7 @@ the *constant factor* is optimal as well. 
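For reference, the conjugate update implemented by the `Beta` class above can be written out explicitly: if our current belief about arm $k$ is $\text{Beta}(\alpha^k, \beta^k)$ and we observe a reward $r \in \{0, 1\}$ from that arm, the posterior becomes

$$
\text{Beta}\big(\alpha^k + r, \; \beta^k + (1 - r)\big),
$$

which is exactly the pair of increments performed in `update`.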
+++ -## Contextual bandits - -In the above MAB environment, the reward distributions of the arms -remain constant. However, in many real-world settings, we might receive -additional information that affects these distributions. For example, in -the online advertising case where each arm corresponds to an ad we could -show the user, we might receive information about the user's preferences -that changes how likely they are to click on a given ad. We can model -such environments using **contextual bandits**. - -:::{prf:definition} Contextual bandit -:label: contextual_bandit - -At each timestep $t$, a new *context* -$x_t$ is drawn from some distribution $\nu_{\text{x}}$. The learner gets -to observe the context, and choose an action $a_t$ according to some -context-dependent policy $\pi_t(x_t)$. Then, the learner observes the -reward from the chosen arm $r_t \sim \nu^{a_t}(x_t)$. The reward -distribution also depends on the context. -::: - -+++ - -Assuming our context is *discrete*, we can just perform the same -algorithms, treating each context-arm pair as its own arm. This gives us -an enlarged MAB of $K |\mathcal{X}|$ arms. - -:::{attention} -Write down the UCB algorithm for this enlarged MAB. That is, write an -expression for $\pi_t(x_t) = \argmax_a \dots$. -::: - -Recall that running UCB for $T$ timesteps on an MAB with $K$ arms -achieves a regret bound of $\tilde{O}(\sqrt{TK})$. So in this problem, -we would achieve regret $\tilde{O}(\sqrt{TK|\mathcal{X}|})$ in the -contextual MAB, which has a polynomial dependence on $|\mathcal{X}|$. -But in a situation where we have large, or even infinitely many -contexts, e.g. in the case where our context is a continuous value, this -becomes intractable. - -Note that this "enlarged MAB" treats the different contexts as entirely -unrelated to each other, while in practice, often contexts are *related* -to each other in some way: for example, we might want to advertise -similar products to users with similar preferences. How can we -incorporate this structure into our solution? - -+++ - -(lin_ucb)= -### Linear contextual bandits - -We want to model the *mean reward* of arm $k$ as a function of the -context, i.e. $\mu^k(x)$. One simple model is the *linear* one: -$\mu^k(x) = x^\top \theta^k$, where $x \in \mathcal{X} = \mathbb{R}^d$ and -$\theta^k \in \mathbb{R}^d$ describes a *feature direction* for arm $k$. Recall -that **supervised learning** gives us a way to estimate a conditional -expectation from samples: We learn a *least squares* estimator from the -timesteps where arm $k$ was selected: -$$\hat \theta_t^k = \argmin_{\theta \in \mathbb{R}^d} \sum_{\{ i \in [t] : a_i = k \}} (r_i - x_i^\top \theta)^2.$$ -This has the closed-form solution known as the *ordinary least squares* -(OLS) estimator: - -:::{math} -:label: ols_bandit - -\begin{aligned} - \hat \theta_t^k & = (A_t^k)^{-1} \sum_{\{ i \in [t] : a_i = k \}} x_i r_i \\ - \text{where} \quad A_t^k & = \sum_{\{ i \in [t] : a_i = k \}} x_i x_i^\top. -\end{aligned} -::: - -We can now apply the UCB algorithm in this environment in order to -balance *exploration* of new arms and *exploitation* of arms that we -believe to have high reward. But how should we construct the upper -confidence bound? Previously, we treated the pulls of an arm as i.i.d. -samples and used Hoeffding's inequality to bound the distance of the -sample mean, our estimator, from the true mean. However, now our -estimator is not a sample mean, but rather the OLS estimator above {eq}`ols_bandit`. 
Instead, we'll use **Chebyshev's -inequality** to construct an upper confidence bound. - -:::{prf:theorem} Chebyshev's inequality -:label: chebyshev - -For a random variable $Y$ such that -$\E Y = 0$ and $\E Y^2 = \sigma^2$, -$$|Y| \le \beta \sigma \quad \text{with probability} \ge 1 - \frac{1}{\beta^2}$$ -::: - -Since the OLS estimator is known to be unbiased (try proving this -yourself), we can apply Chebyshev's inequality to -$x_t^\top (\hat \theta_t^k - \theta^k)$: - -$$\begin{aligned} - x_t^\top \theta^k \le x_t^\top \hat \theta_t^k + \beta \sqrt{x_t^\top (A_t^k)^{-1} x_t} \quad \text{with probability} \ge 1 - \frac{1}{\beta^2} -\end{aligned}$$ - -:::{attention} -We haven't explained why $x_t^\top (A_t^k)^{-1} x_t$ is the correct -expression for the variance of $x_t^\top \hat \theta_t^k$. This result -follows from some algebra on the definition of the OLS estimator {eq}`ols_bandit`. -::: - -The first term is exactly our predicted reward $\hat \mu^k_t(x_t)$. To -interpret the second term, note that -$$x_t^\top (A_t^k)^{-1} x_t = \frac{1}{N_t^k} x_t^\top (\Sigma_t^k)^{-1} x_t,$$ -where -$$\Sigma_t^k = \frac{1}{N_t^k} \sum_{\{ i \in [t] : a_i = k \}} x_i x_i^\top$$ -is the empirical covariance matrix of the contexts (assuming that the -context has mean zero). That is, the learner is encouraged to choose -arms when $x_t$ is *not aligned* with the data seen so far, or if arm -$k$ has not been explored much and so $N_t^k$ is small. - -We can now substitute these quantities into UCB to get the **LinUCB** -algorithm: - -```{code-cell} -class LinUCBPseudocode(Agent): - def __init__( - self, K: int, T: int, D: int, lam: float, get_c: Callable[[int], float] - ): - super().__init__(K, T) - self.lam = lam - self.get_c = get_c - self.contexts = [None for _ in range(K)] - self.A = np.repeat(lam * np.eye(D)[...], K) - self.targets = np.zeros(K, D) - self.w = np.zeros(K, D) - - def choose_arm(self, context: Float[Array, " D"]): - c = self.get_c(self.count) - scores = self.w @ context + c * np.sqrt( - context.T @ np.linalg.solve(self.A, context) - ) - return random_argmax(scores) - - def update_history(self, context: Float[Array, " D"], arm: int, reward: int): - self.A[arm] += np.outer(context, context) - self.targets[arm] += context * reward - self.w[arm] = np.linalg.solve(self.A[arm], self.targets[arm]) -``` - -:::{attention} -Note that the matrix $A_t^k$ above might not be invertible. When does this occur? One way to address this is to include a $\lambda I$ regularization term to ensure that $A_t^k$ is invertible. This is equivalent to solving a *ridge regression* problem instead of the unregularized least squares problem. Implement this solution. TODO SOLUTION CURRENTLY SHOWN -::: - -+++ - -$c_t$ is similar to the $\log (2t/\delta')$ term of UCB: It controls the -width of the confidence interval. Here, we treat it as a tunable -parameter, though in a theoretical analysis, it would depend on $A_t^k$ -and the probability $\delta$ with which the bound holds. - -Using similar tools for UCB, we can also prove an $\tilde{O}(\sqrt{T})$ -regret bound. The full details of the analysis can be found in Section 3 of {cite}`agarwal_reinforcement_2022`. - -+++ - ## Summary + +In this chapter, +we explored the **multi-armed bandit** setting for analyzing sequential decision-making in an unknown environment. 
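We began with strategies that either explore blindly or commit too early, namely pure exploration and the pure greedy strategy, both of which suffer regret that grows linearly in $T$. Explore-then-commit and the epsilon-greedy algorithm balance exploration and exploitation more deliberately, while the upper confidence bound algorithm and Thompson sampling go further and explore *adaptively*, achieving sublinear regret on the order of $\tilde O(\sqrt{TK})$.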
diff --git a/book/challenges.md b/book/challenges.md deleted file mode 100644 index c8a1af1..0000000 --- a/book/challenges.md +++ /dev/null @@ -1,76 +0,0 @@ -# Guarantees for policy gradient methods - -What kinds of problems are policy gradient methods good at solving? - -For example, consider a very simple MDP in which - -outline - -\# sparse reward setting - -e.g. l/r, move to end; random policy 1/(2\^n) - -no rewards early on, so no gradients - -\## possible solutions - -if simulator: use better starting - -imitation learning (today) - -exploration - ucb-vi - -reward shaping - -\# guarantees for pg - -sl works in many settings - -want to show that some benefits extend to rl - -eg sample efficiency needed for softmax (log linear) policy - -\- eg under npg - -what features do we need for good learning? (approximation error between -ground truth and our function class) - -hopefully samples poly(dim(φ), 1/eps) - -need some coverage over state space - ---- - -but convergence guarantees are hard - -\# imitation learning - -eg how humans learn by imitating experts - -access to expert demonstrations - -use sl to create a policy - -input: senses output: action - -\## setting - -unknown reward function - -assume expert has good policy - -goal is to learn a policy as good as expert - ---- - -\# BC - -e.g. maximum likelihood (stochastic) - -or classification error (deterministic) - -or squared error for continuous actions - ---- - -theorem: il is almost as easy as sl diff --git a/book/contextual_bandits.md b/book/contextual_bandits.md new file mode 100644 index 0000000..1afe425 --- /dev/null +++ b/book/contextual_bandits.md @@ -0,0 +1,168 @@ +--- +jupytext: + text_representation: + extension: .md + format_name: myst + format_version: 0.13 + jupytext_version: 1.16.2 +kernelspec: + display_name: Python 3 (ipykernel) + language: python + name: python3 +--- + +(contextual_bandits)= +# Contextual bandits + +In the above MAB environment, the reward distributions of the arms +remain constant. However, in many real-world settings, we might receive +additional information that affects these distributions. For example, in +the online advertising case where each arm corresponds to an ad we could +show the user, we might receive information about the user's preferences +that changes how likely they are to click on a given ad. We can model +such environments using **contextual bandits**. + +:::{prf:definition} Contextual bandit +:label: contextual_bandit + +At each timestep $t$, a new *context* +$x_t$ is drawn from some distribution $\nu_{\text{x}}$. The learner gets +to observe the context, and choose an action $a_t$ according to some +context-dependent policy $\pi_t(x_t)$. Then, the learner observes the +reward from the chosen arm $r_t \sim \nu^{a_t}(x_t)$. The reward +distribution also depends on the context. +::: + ++++ + +Assuming our context is *discrete*, we can just perform the same +algorithms, treating each context-arm pair as its own arm. This gives us +an enlarged MAB of $K |\mathcal{X}|$ arms. + +:::{attention} +Write down the UCB algorithm for this enlarged MAB. That is, write an +expression for $\pi_t(x_t) = \argmax_a \dots$. +::: + +Recall that running UCB for $T$ timesteps on an MAB with $K$ arms +achieves a regret bound of $\tilde{O}(\sqrt{TK})$. So in this problem, +we would achieve regret $\tilde{O}(\sqrt{TK|\mathcal{X}|})$ in the +contextual MAB, which has a polynomial dependence on $|\mathcal{X}|$. +But in a situation where we have large, or even infinitely many +contexts, e.g. 
in the case where our context is a continuous value, this +becomes intractable. + +Note that this "enlarged MAB" treats the different contexts as entirely +unrelated to each other, while in practice, often contexts are *related* +to each other in some way: for example, we might want to advertise +similar products to users with similar preferences. How can we +incorporate this structure into our solution? + ++++ + +(lin_ucb)= +## Linear contextual bandits + +We want to model the *mean reward* of arm $k$ as a function of the +context, i.e. $\mu^k(x)$. One simple model is the *linear* one: +$\mu^k(x) = x^\top \theta^k$, where $x \in \mathcal{X} = \mathbb{R}^d$ and +$\theta^k \in \mathbb{R}^d$ describes a *feature direction* for arm $k$. Recall +that **supervised learning** gives us a way to estimate a conditional +expectation from samples: We learn a *least squares* estimator from the +timesteps where arm $k$ was selected: +$$\hat \theta_t^k = \argmin_{\theta \in \mathbb{R}^d} \sum_{\{ i \in [t] : a_i = k \}} (r_i - x_i^\top \theta)^2.$$ +This has the closed-form solution known as the *ordinary least squares* +(OLS) estimator: + +:::{math} +:label: ols_bandit + +\begin{aligned} + \hat \theta_t^k & = (A_t^k)^{-1} \sum_{\{ i \in [t] : a_i = k \}} x_i r_i \\ + \text{where} \quad A_t^k & = \sum_{\{ i \in [t] : a_i = k \}} x_i x_i^\top. +\end{aligned} +::: + +We can now apply the UCB algorithm in this environment in order to +balance *exploration* of new arms and *exploitation* of arms that we +believe to have high reward. But how should we construct the upper +confidence bound? Previously, we treated the pulls of an arm as i.i.d. +samples and used Hoeffding's inequality to bound the distance of the +sample mean, our estimator, from the true mean. However, now our +estimator is not a sample mean, but rather the OLS estimator above {eq}`ols_bandit`. Instead, we'll use **Chebyshev's +inequality** to construct an upper confidence bound. + +:::{prf:theorem} Chebyshev's inequality +:label: chebyshev + +For a random variable $Y$ such that +$\E Y = 0$ and $\E Y^2 = \sigma^2$, +$$|Y| \le \beta \sigma \quad \text{with probability} \ge 1 - \frac{1}{\beta^2}$$ +::: + +Since the OLS estimator is known to be unbiased (try proving this +yourself), we can apply Chebyshev's inequality to +$x_t^\top (\hat \theta_t^k - \theta^k)$: + +$$\begin{aligned} + x_t^\top \theta^k \le x_t^\top \hat \theta_t^k + \beta \sqrt{x_t^\top (A_t^k)^{-1} x_t} \quad \text{with probability} \ge 1 - \frac{1}{\beta^2} +\end{aligned}$$ + +:::{attention} +We haven't explained why $x_t^\top (A_t^k)^{-1} x_t$ is the correct +expression for the variance of $x_t^\top \hat \theta_t^k$. This result +follows from some algebra on the definition of the OLS estimator {eq}`ols_bandit`. +::: + +The first term is exactly our predicted reward $\hat \mu^k_t(x_t)$. To +interpret the second term, note that +$$x_t^\top (A_t^k)^{-1} x_t = \frac{1}{N_t^k} x_t^\top (\Sigma_t^k)^{-1} x_t,$$ +where +$$\Sigma_t^k = \frac{1}{N_t^k} \sum_{\{ i \in [t] : a_i = k \}} x_i x_i^\top$$ +is the empirical covariance matrix of the contexts (assuming that the +context has mean zero). That is, the learner is encouraged to choose +arms when $x_t$ is *not aligned* with the data seen so far, or if arm +$k$ has not been explored much and so $N_t^k$ is small. 
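Combining the estimate with this width term gives a context-dependent upper confidence bound for each arm, and choosing the maximizing arm yields the selection rule sketched below (with $\beta$ standing in for the width parameter, which the code that follows treats as a tunable constant $c_t$):

$$
\pi_t(x_t) = \argmax_{k} \left( x_t^\top \hat \theta_t^k + \beta \sqrt{x_t^\top (A_t^k)^{-1} x_t} \right).
$$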
+ +We can now substitute these quantities into UCB to get the **LinUCB** +algorithm: + +```{code-cell} +class LinUCBPseudocode(Agent): + def __init__( + self, K: int, T: int, D: int, lam: float, get_c: Callable[[int], float] + ): + super().__init__(K, T) + self.lam = lam + self.get_c = get_c + self.contexts = [None for _ in range(K)] + self.A = np.repeat(lam * np.eye(D)[...], K) + self.targets = np.zeros(K, D) + self.w = np.zeros(K, D) + + def choose_arm(self, context: Float[Array, " D"]): + c = self.get_c(self.count) + scores = self.w @ context + c * np.sqrt( + context.T @ np.linalg.solve(self.A, context) + ) + return random_argmax(scores) + + def update_history(self, context: Float[Array, " D"], arm: int, reward: int): + self.A[arm] += np.outer(context, context) + self.targets[arm] += context * reward + self.w[arm] = np.linalg.solve(self.A[arm], self.targets[arm]) +``` + +:::{attention} +Note that the matrix $A_t^k$ above might not be invertible. When does this occur? One way to address this is to include a $\lambda I$ regularization term to ensure that $A_t^k$ is invertible. This is equivalent to solving a *ridge regression* problem instead of the unregularized least squares problem. Implement this solution. TODO SOLUTION CURRENTLY SHOWN +::: + ++++ + +$c_t$ is similar to the $\log (2t/\delta')$ term of UCB: It controls the +width of the confidence interval. Here, we treat it as a tunable +parameter, though in a theoretical analysis, it would depend on $A_t^k$ +and the probability $\delta$ with which the bound holds. + +Using similar tools for UCB, we can also prove an $\tilde{O}(\sqrt{T})$ +regret bound. The full details of the analysis can be found in Section 3 of {cite}`agarwal_reinforcement_2022`. diff --git a/book/exploration.md b/book/exploration.md index 7e4fd2a..24e4736 100644 --- a/book/exploration.md +++ b/book/exploration.md @@ -11,6 +11,7 @@ kernelspec: name: python3 --- +(exploration)= # Exploration in MDPs ## Introduction @@ -74,7 +75,7 @@ Performance of explore-then-exploitexplore_then_exploit_performance As long as e We also explored the exploration-exploitation tradeoff in the chapter on {ref}`bandits`. Recall tthat in the MAB setting, we have $K$ arms, each of which has an unknown reward distribution, and we want to learn which of the arms is *optimal*, i.e. has the highest mean reward. -One algorithm that struck a good balance between exploration and exploitation was the **upper confidence bound** algorithm {ref}`ucb`: For each arm, we construct a *confidence interval* for its true mean award, and then choose the arm with the highest upper confidence bound. In summary, $$k_{t+1} \gets \argmax_{k \in [K]} \frac{R^{k}_t}{N^{k}_t} + \sqrt{\frac{\ln(2t/\delta)}{2 N^{k}_t}}$$ where $N_t^k$ indicates the number of times arm $k$ has been pulled up until time $t$, $R_t^k$ indicates the total reward obtained by pulling arm $k$ up until time $t$, and $\delta > 0$ controls the width of the confidence interval. How might we extend UCB to the MDP case? +One algorithm that struck a good balance between exploration and exploitation was the **upper confidence bound** algorithm {ref}`ucb`: For each arm, we construct a *confidence interval* for its true mean award, and then choose the arm with the highest upper confidence bound. 
In summary, $$k_{t+1} \gets \arg\max_{k \in [K]} \frac{R^{k}_t}{N^{k}_t} + \sqrt{\frac{\ln(2t/\delta)}{2 N^{k}_t}}$$ where $N_t^k$ indicates the number of times arm $k$ has been pulled up until time $t$, $R_t^k$ indicates the total reward obtained by pulling arm $k$ up until time $t$, and $\delta > 0$ controls the width of the confidence interval. How might we extend UCB to the MDP case? Let us formally describe an unknown MDP as an MAB problem. In an unknown MDP, we want to learn which *policy* is optimal. So if we want to apply MAB techniques to solving an MDP, it makes sense to think of *arms* as *policies*. There are $K = (|\mathcal{A}|^{|\mathcal{S}|})^\hor$ deterministic policies in a finite MDP. Then, "pulling" arm $\pi$ corresponds to using $\pi$ to act through a trajectory in the MDP, and observing the total reward. diff --git a/book/fitted_dp.md b/book/fitted_dp.md index 3f6f601..1d79d1c 100644 --- a/book/fitted_dp.md +++ b/book/fitted_dp.md @@ -366,106 +366,4 @@ def fitted_policy_iteration( return π ``` -(supervised_learning)= -## Supervised learning - -This section will cover the details of implementing the `fit` function above: -That is, how to use a dataset of labelled samples $(x_1, y_1), \dots, (x_N, y_N)$ to find a function $f$ that minimizes the empirical risk. -This requires two ingredients: - -1. A **function class** $\mathcal{F}$ to search over -2. A **fitting method** for minimizing the empirical risk over this class - -The two main function classes we will cover are **linear models** and **neural networks**. -Both of these function classes are *parameterized* by some parameters $\theta$, -and the fitting method will search over these parameters to minimize the empirical risk: - -:::{prf:definition} Parameterized empirical risk minimization -:label: parameterized_empirical_risk_minimization - -Given a dataset of samples $(x_1, y_1), \dots, (x_N, y_N)$ and a class of functions $\mathcal{F}$ parameterized by $\theta$, -we to find a parameter (vector) $\hat \theta$ that minimizes the empirical risk: - -$$ -\hat \theta = \arg\min_{\theta} \frac{1}{N} \sum_{i=1}^N (y_i - f_\theta(x_i))^2 -$$ -::: - -The most common fitting method for parameterized models is **gradient descent**. - -:::{prf:algorithm} Gradient descent -Letting $L(\theta) \in \mathbb{R}$ denote the empirical risk in terms of the parameters, -the gradient descent algorithm updates the parameters according to the rule - -$$ -\theta^{t+1} = \theta^t - \eta \nabla_\theta L(\theta^t) -$$ - -where $\eta > 0$ is the **learning rate**. -::: - -```{code-cell} -Params = Float[Array, " D"] - - -def gradient_descent( - loss: Callable[[Params], float], - θ_init: Params, - η: float, - epochs: int, -): - """ - Run gradient descent to minimize the given loss function - (expressed in terms of the parameters). - """ - θ = θ_init - for _ in range(epochs): - θ = θ - η * grad(loss)(θ) - return θ -``` - -### Linear regression - -In linear regression, we assume that the function $f$ is linear in the parameters: - -$$ -\mathcal{F} = \{ x \mapsto \theta^\top x \mid \theta \in \mathbb{R}^D \} -$$ - -This function class is extremely simple and only contains linear functions. -To expand its expressivity, we can _transform_ the input $x$ using some feature function $\phi$, -i.e. $\widetilde x = \phi(x)$, and then fit a linear model in the transformed space instead. 
- -```{code-cell} -def fit_linear(X: Float[Array, "N D"], y: Float[Array, " N"], φ=lambda x: x): - """Fit a linear model to the given dataset using ordinary least squares.""" - X = vmap(φ)(X) - θ = np.linalg.lstsq(X, y, rcond=None)[0] - return lambda x: np.dot(φ(x), θ) -``` - -### Neural networks - -In neural networks, we assume that the function $f$ is a composition of linear functions (represented by matrices $W_i$) and non-linear activation functions (denoted by $\sigma$): - -$$ -\mathcal{F} = \{ x \mapsto \sigma(W_L \sigma(W_{L-1} \dots \sigma(W_1 x + b_1) \dots + b_{L-1}) + b_L) \} -$$ - -where $W_i \in \mathbb{R}^{D_{i+1} \times D_i}$ and $b_i \in \mathbb{R}^{D_{i+1}}$ are the parameters of the $i$-th layer, and $\sigma$ is the activation function. - -This function class is much more expressive and contains many more parameters. -This makes it more susceptible to overfitting on smaller datasets, -but also allows it to represent more complex functions. -In practice, however, neural networks exhibit interesting phenomena during training, -and are often able to generalize well even with many parameters. - -Another reason for their popularity is the efficient **backpropagation** algorithm -for computing the gradient of the empirical risk with respect to the parameters. -Essentially, the hierarchical structure of the neural network, i.e. computing the -output of the network as a composition of functions, allows us to use the chain rule -to compute the gradient of the output with respect to the parameters of each layer. - -{cite}`nielsen_neural_2015` provides a comprehensive introduction to neural networks and backpropagation. - -## Bias correction for Q-learning +## Summary diff --git a/book/imitation_learning.md b/book/imitation_learning.md index e893360..e0176e6 100644 --- a/book/imitation_learning.md +++ b/book/imitation_learning.md @@ -11,6 +11,7 @@ kernelspec: name: python3 --- +(imitation_learning)= # Imitation Learning Imagine you are tasked with learning how to drive. How do, or did, you go about it? diff --git a/book/index.md b/book/index.md index a2cf697..b712b13 100644 --- a/book/index.md +++ b/book/index.md @@ -1,7 +1,177 @@ # Introduction -This is an undergraduate textbook on reinforcement learning. +Welcome to the study of reinforcement learning! +This textbook accompanies the undergraduate course [CS 1840/STAT 184](http://lucasjanson.fas.harvard.edu/CS_Stat_184_0.html) taught at Harvard. +It is intended to be a friendly yet rigorous introduction to this active subfield of machine learning. -```{tableofcontents} -``` ++++ +## Prerequisites + +This book assumes the same prerequisites as the course: You should be familiar with multivariable calculus, linear algebra, and probability. +For Harvard undergraduates, this is fulfilled by Math 21a, Math 21b, and Stat 110, or their equivalents. +Stat 111 is strongly recommended but not required. +Specifically, we will assume that you know the following topics. The _italicized terms_ have brief re-introductions in the text or in the {ref}`background`: + +- **Linear Algebra:** Vectors and matrices, matrix multiplication, matrix + inversion, eigenvalues and eigenvectors. +- **Multivariable Calculus:** Partial derivatives, the chain rule, Taylor series, _gradients, directional derivatives, Lagrange multipliers._ +- **Probability:** Random variables, probability distributions, + expectation and variance, the law of iterated expectations (Adam's rule), covariance, conditional probability, Bayes's rule, and the law of total probability. 
+ +You should also be comfortable with programming in Python. +See {ref}`programming` for more about this textbook's philosophy regarding programming. + ++++ + +## Reinforcement learning in a nutshell + +Broadly speaking, +RL studies **sequential decision-making** in **dynamic environments.** +An RL algorithm finds a **policy,** or strategy, that maximizes the **reward** it obtains from the environment. + +RL provides a powerful framework for attacking a wide variety of problems, +including robotic control, video games and board games, resource management, language modelling, and more. +It also provides an interdisciplinary paradigm for studying animal and human behavior. +Many of the most stunning results in machine learning, ranging from AlphaGo to ChatGPT, are built using RL algorithms. + +How does RL compare to the other two core machine learning paradigms, +**supervised learning** and **unsupervised learning?** + +- **Supervised learning** (SL) concerns itself with learning a mapping from inputs to outputs. + Typically the data takes the form of _statistically independent_ input-output pairs. + In RL, however, the data is generated by the agent interacting with the environment, + meaning the sequential observations of the state are _not independent_ from each other. + + Conversely, SL is a well-studied field that provides many useful tools for RL. + +- **Unsupervised learning** concerns itself with learning the _structure_ of data without the use of outside feedback or labels. + In RL, though, the agent receives a **reward signal** from the environment, + which can be thought of as a sort of feedback. + + Unsupervised learning is crucial in many real-world applications of RL for dimensionality reduction and other purposes. + ++++ + +## Core tasks in reinforcement learning + +What tasks, exactly, does RL comprise? +An RL algorithm must typically solve two main subtasks: + +- **Policy evaluation (prediction):** + How 'good' is a specific state, or state-action pair (under a given policy)? + That is, how much reward does it lead to in the long run? + +- **Policy optimization (control):** + Suppose we fully understand how the environment behaves. + What is the best action to take in every scenario? + + + + + ++++ + +## Course overview + +The course will progress through the following units: + +{ref}`mdps` introduces **Markov Decision Processes,** +the core mathematical framework for describing a large class of interactive environments. + +{ref}`lqr` is a standalone chapter on the **linear quadratic regulator** (LQR), +an important tool for *continuous control*, +in which the state and action spaces are no longer _finite_ but rather _continuous_. +This has widespread applications in robotics. + +{ref}`bandits` introduces the **multi-armed bandit** (MAB) model for _stateless_ sequential decision-making tasks. +In exploring a number of algorithms, +we will see how each of them strikes a different balance between _exploring_ new options and _exploiting_ known options. +This **exploration-exploitation tradeoff** is a core consideration in RL algorithm design. + +{ref}`supervised_learning` is a standalone crash course on some tools from supervised learning that we will use in later chapters. + +{ref}`fitted_dp` introduces **fitted dynamic programming** (fitted DP) algorithms for solving MDPs. +These algorithms use supervised learning to approximately evaluate policies when they cannot be evaluated exactly. 
+ +{ref}`pg` explores an important class of algorithms based on iteratively improving a policy. +We will also encounter the use of _deep neural networks_ to express more complicated policies and approximate complicated functions. + +{ref}`imitation_learning` attempts to learn a good policy from expert demonstrations. +At its most basic, this is an application of supervised learning to RL tasks. + +{ref}`planning` looks at ways to _explicitly_ plan ahead when the environment's dynamics are known. +We will study the _Monte Carlo Tree Search_ heuristic, +which has been used to great success in the famous AlphaGo algorithm and its successors. + +{ref}`exploration` continues to investigate the exploration-exploitation tradeoff. +We will extend ideas from multi-armed bandits to the MDP setting. + +{ref}`contextual_bandits` extends the multi-armed bandit setting with some observed state. + +{ref}`background` contains an overview of selected background mathematical content and programming content. + + + ++++ + +## Notation + +We will use the following notation throughout the book. +This notation is inspired by {cite}`sutton_reinforcement_2018` and {cite}`agarwal_reinforcement_2022`. +We use $[N]$ as shorthand for the set $\{ 0, 1, \dots, N-1 \}$. + +| Element | Space | Definition (of element) | +|:------------:|:------------------------:|:--------------------------| +| $s$ | $\mathcal{S}$ | A state. | +| $a$ | $\mathcal{A}$ | An action. | +| $r$ | | A reward. | +| $\gamma$ | | A discount factor. | +| $\tau$ | $\mathcal{T}$ | A trajectory. | +| $\pi$ | $\Pi$ | A policy. | +| $V^\pi$ | $\mathcal{S} \to \mathbb{R}$ | The value function of policy $\pi$. | +| $Q^\pi$ | $\mathcal{S} \times \mathcal{A} \to \mathbb{R}$ | The action-value function (a.k.a. Q-function) of policy $\pi$. | +| $A^\pi$ | $\mathcal{S} \times \mathcal{A} \to \mathbb{R}$ | The advantage function of policy $\pi$. | +| | $\triangle(\mathcal{X})$ | A distribution supported on $\mathcal{X}$. | +| $\hi$ | $[\hor]$ | Time horizon index of an MDP (subscript). | +| $k$ | $[K]$ | Arm index of a multi-armed bandit (superscript). | +| $t$ | $[T]$ | Iteration index of an algorithm (subscript). | +| $\theta$ | $\Theta$ | A set of parameters. | + +Note that throughout the text, certain symbols will stand for either random variables or fixed values. +We aim to clarify in ambiguous settings. +Be warned that + ++++ + +(programming)= +## Programming + +Why include code in a textbook? +We believe that implementing an algorithm is a strong test of your understanding of it; +mathematical notation can often abstract away details, +while a computer must be given every single instruction. +We have sought to write readable Python code that is self-contained within each file. +This approach is inspired by {cite}`sussman_functional_2013`. +There are some ways in which the code style differs from typical software projects: + +- We keep use of language features to a minimum, + even if it leads to code that could otherwise be more concisely or idiomatically expressed. +- The variable names used in the code match those used in the main text. + For example, the variable `s` will be used instead of the more explicit `state`. + +We also make extensive use of Python _type annotations_ to explicitly specify variable types, including shapes of vectors and matrices using the [jaxtyping](https://github.com/patrick-kidger/jaxtyping) library. + +This is an interactive book built with [Jupyter Book](https://jupyterbook.org/en/stable/intro.html). 
+It uses [Python 3.11](https://docs.python.org/3.11/contents.html). +It uses the [JAX](https://jax.readthedocs.io/en/latest/index.html) library for numerical computing. +JAX was chosen for the clarity of its functional style and due to its mature RL ecosystem, +sustained in large part by the Google DeepMind research group and a large body of open-source contributors. +We use the standard [Gymnasium](https://gymnasium.farama.org/) library for interfacing with RL environments. diff --git a/book/intro.md b/book/intro.md deleted file mode 100644 index 7a006a6..0000000 --- a/book/intro.md +++ /dev/null @@ -1,153 +0,0 @@ ---- -jupytext: - text_representation: - extension: .md - format_name: myst - format_version: 0.13 - jupytext_version: 1.16.2 -kernelspec: - display_name: Python 3 (ipykernel) - language: python - name: python3 ---- - -# Introduction - -Welcome to the study of reinforcement learning! This textbook accompanies the undergraduate course [CS/STAT 184](https://shamulent.github.io/CS_Stat184_Fall23.html) taught at Harvard and is intended to be a friendly yet rigorous introduction to this active subfield of machine learning. Here are some questions you might have before embarking on this journey: - -+++ - -## Prerequisites - -This book assumes familiarity with multivariable calculus, linear algebra, and probability. For Harvard undergraduates, this would be fulfilled by Math 21a, Math 21b, and Stat 110. Stat 111 is strongly recommended but not required. Here is a non-comprehensive list of topics you should be familiar with before starting this book: - -- **Linear Algebra:** Vectors and matrices, matrix multiplication, matrix - inversion, eigenvalues and eigenvectors, and the Gram-Schmidt - process. -- **Multivariable Calculus:** Partial derivatives, gradients, - directional derivatives, the chain rule, Taylor series. -- **Probability:** Random variables, probability distributions, - expectation and variance, the law of iterated expectations (Adam's rule), covariance, conditional probability, Bayes's rule, and the law of total probability. - -+++ - -## Reinforcement learning in a nutshell - -Broadly speaking, RL is a subfield of machine learning that studies how an agent can learn to make sequential decisions in a dynamic environment. It provides a powerful framework for attacking a wide variety of problems, including robotic control, video games and board games, resource management, language modelling, and more. It also provides an interdisciplinary paradigm for studying animal and human behavior. Many of the most stunning results in machine learning, ranging from AlphaGo to ChatGPT, are built on top of RL. - -**How does reinforcement learning differ from other machine learning paradigms?** Machine learning is often broken down into the three subcategories of supervised learning, unsupervised learning, and reinforcement learning. - -- **Supervised learning.** Supervised learning concerns itself with - learning a mapping from inputs to outputs (e.g. image - classification). Typically the data takes the form of input-output - pairs that are assumed to be sampled independently from some - generating distribution. In RL, however, the data is generated by - the agent interacting with the environment, meaning the observations - depend on the agent's behaviour and are not independent from each - other. This requires a more general set of tools. - - Conversely, supervised learning is a well-studied field that - provides many useful tools for RL. 
For example, it may be useful to - use supervised learning to predict how valuable a given state is, or - to predict the probability of transitioning to a given state. - -- **Unsupervised learning.** Unsupervised learning deals with learning the - structure of data without the use of labels. - -### Challenges of reinforcement learning - -The RL task is quite general. It will be helpful to decompose it into a few subtasks that can be tackled separately. This decomposition also provides a useful framework for analyzing the algorithms we will encounter. - -**Policy evaluation (prediction):** How 'good' is a specific state, or state-action pair? That is, how much reward does it lead to in the long run? - -**Policy optimization (control):** Suppose we have a complete, accurate model of how the environment behaves. What is the best action to take in every scenario? - -**Recursion (bootstrapping):** How can we "reuse" our current predictions to generate new information? - -**Exploration-exploitation tradeoff:** Should we try new actions, or capitalize on actions that we currently believe to be good? - -+++ - -## Overview - -{ref}`bandits` discusses **multi-armed bandits,** a simple model for -reinforcement learning. In this setting, there are multiple arms, each with their own reward distribution. The agent must decide which arm to pull at each time step. - -{ref}`mdps` introduces (finite) **Markov Decision Processes**, a mathematical framework for describing interactive environments. Certain states and actions will be rewarded, and the agent's goal is to maximize its total reward. - -{ref}`fitted_dp` introduces **fitted dynamic programming** algorithms for solving MDPs when the state space is too large to be enumerated. These algorithms borrow ideas from supervised learning to approximate the value function (discussed in {ref}`mdps`). - -{ref}`lqr` is a standalone chapter on the **linear quadratic regulator**, -an important tool for *continuous control*, in which the state space is no longer finite (i.e. $|\mathcal{S}| < \infty$) but rather continuous (i.e. $|\mathcal{S}| = \mathbb{R}^{n_s}$). - -+++ - -| Chapter | States | Actions | Rewards (or costs) | -|:-------:|:------:|:-------:|:-------:| -| {ref}`bandits` | N/A | Finite | Stochastic | -| {ref}`mdps` | Finite | Finite | Deterministic | -| {ref}`fitted_dp` | Large or continuous | Finite | Deterministic | -| {ref}`lqr` | Continuous | Continuous | Deterministic | - -+++ - -## Notation - -We will use the following notation throughout the book. This notation is -inspired by {cite}`sutton_reinforcement_2018` and {cite}`agarwal_reinforcement_2022`. - -| Notation | Definition | -|:-------------:|:--------------------------| -| $s$ | A state. | -| $a$ | An action. | -| $r$ | A reward. | -| $p$ | A probability. | -| $\pi$ | A policy. | -| $V$ | A value function. | -| $Q$ | An action-value function. | -| $A$ | An advantage function. | -| $\gamma$ | A discount factor. | -| $\tau$ | A trajectory. | -| $\mathcal{S}$ | A state space. | -| $\mathcal{A}$ | An action space. | - -+++ - -## Programming - -Why include code in a textbook? We believe that implementing an algorithm is a strong test of your understanding of it; mathematical notation can often abstract away details, while a computer must be given every detailed instruction. We have sought to write concise, readable, idiomatic Python code that is self-contained within each file. This approach is inspired by {cite}`sussman_functional_2013`. 
- -This is an interactive book built with [Jupyter Book](https://jupyterbook.org/en/stable/intro.html). It uses [Python 3.11](https://docs.python.org/3.11/contents.html). It uses the [JAX](https://jax.readthedocs.io/en/latest/index.html) library for numerical computing. JAX was chosen for the clarity of its functional style and due to its mature RL ecosystem, sustained in large part by the Google DeepMind research group and a large body of open-source contributors. We use the standard [Gymnasium](https://gymnasium.farama.org/) library for interfacing with RL environments. - -+++ - -## O notation - -Throughout this chapter and the rest of the book, we will describe the -asymptotic behavior of a function using $O$ notation. - -For two functions $f(t)$ and $g(t)$, we say that $f(t) \le O(g(t))$ if -$f$ is asymptotically upper bounded by $g$. Formally, this means that -there exists some constant $C > 0$ such that $f(t) \le C \cdot g(t)$ for -all $t$ past some point $t_0$. - -We say $f(t) < o(g(t))$ if asymptotically $f$ grows strictly slower than -$g$. Formally, this means that for *any* scalar $C > 0$, there exists -some $t_0$ such that $f(t) \le C \cdot g(t)$ for all $t > t_0$. -Equivalently, we say $f(t) < o(g(t))$ if -$\lim_{t \to \infty} f(t)/g(t) = 0$. - -$f(t) = \Theta(g(t))$ means that $f$ and $g$ grow at the same rate -asymptotically. That is, $f(t) \le O(g(t))$ and $g(t) \le O(f(t))$. - -Finally, we use $f(t) \ge \Omega(g(t))$ to mean that $g(t) \le O(f(t))$, -and $f(t) > \omega(g(t))$ to mean that $g(t) < o(f(t))$. - -We also use the notation $\tilde O(g(t))$ to hide logarithmic factors. -That is, $f(t) = \tilde O(g(t))$ if there exists some constant $C$ such -that $f(t) \le C \cdot g(t) \cdot \log^k(t)$ for some $k$ and all $t$. - -Occasionally, we will also use $O(f(t))$ (or one of the other symbols) -as shorthand to manipulate function classes. For example, we might write -$O(f(t)) + O(g(t)) = O(f(t) + g(t))$ to mean that the sum of two -functions in $O(f(t))$ and $O(g(t))$ is in $O(f(t) + g(t))$. diff --git a/book/mdps.md b/book/mdps.md index 482b318..300f407 100644 --- a/book/mdps.md +++ b/book/mdps.md @@ -12,18 +12,16 @@ kernelspec: --- (mdps)= -# Finite Markov Decision Processes +# Markov Decision Processes -```{contents} -:local: -``` +## Introduction ```{code-cell} :tags: [hide-input] from typing import NamedTuple from jaxtyping import Float, Array -import jax.numpy as np +import jax.numpy as jnp from jax import vmap from functools import partial ``` @@ -32,16 +30,18 @@ The field of RL studies how an agent can learn to make sequential decisions in a Let’s consider some examples of sequential decision problems to identify the key common properties we’d like to capture: -- **Board games** like chess or Go, where the player takes turns with - the opponent to make moves on the board. - -- **Video games** like Breakout, where the player - controls a character to reach the goal. - -- **Robotic control**, where the robot can move and interact with the - real-world environment to complete some task. +- **Board games and video games,** where a player takes actions in a virtual environment. +- **Inventory management,** where a company must efficiently move resources from producers to consumers. +- **Robotic control**, where a robot can move and interact with the real world to complete some task. -All of these fit into the RL framework. 
Furthermore, these are environments where the **state transitions**, the “rules” of the environment, only depend on the *most recent* state and action. This is called the **Markov property**. +In these environments and many others, the **state transitions**, +the “rules” of the environment, +only depend on the *most recent* state and action (generally speaking). +For example, if you want to take a break while playing a game of chess, +you could take a picture of the board, +and later on reset the board to that state and continue playing; +the past history of moves doesn't matter (generally speaking). +This is called the **Markov property.** :::{prf:definition} Markov property :label: markov @@ -52,38 +52,33 @@ state and action: $$\P(s_{\hi+1} \mid s_0, a_0, \dots, s_\hi, a_\hi) = P(s_{\hi+1} \mid s_\hi, a_\hi)$$ -where $P : \mathcal{S} \times \mathcal{A} \to \Delta(\mathcal{S})$ describes the state transitions. +where $P : \mathcal{S} \times \mathcal{A} \to \triangle(\mathcal{S})$ describes the state transitions. (We’ll elaborate on this notation later in the chapter.) ::: -We’ll see that this simple assumption leads to a rich set of problems -and algorithms. Environments with the Markov property are called -**Markov decision processes** (MDPs) and will be the focus of this -chapter. +Environments that satisfy the Markov property are called **Markov decision processes** (MDPs). +This chapter will focus on introducing core vocabulary for MDPs that will be useful throughout the book. :::{attention} -What information might be encoded in the state for each of -the above examples? What might the valid set of actions be? Describe the -state transitions heuristically and verify that they satisfy the Markov -property. +What information might be encoded in the _state_ for each of the above examples? +What might the valid set of _actions_ be? +Describe the _state transitions_ heuristically and verify that they satisfy the Markov property. ::: -MDPs are usually classified as **finite-horizon**, where the -interactions end after some finite number of time steps, or -**infinite-horizon**, where the interactions can continue indefinitely. -We’ll begin with the finite-horizon case and discuss the -infinite-horizon case in the second half of the chapter. +MDPs are usually classified as **finite-horizon**, where the interactions end after some finite number of time steps, +or **infinite-horizon**, where the interactions can continue indefinitely. +We’ll begin with the finite-horizon case and discuss the infinite-horizon case in the second half of the chapter. -In each setting, we’ll describe how to evaluate different **policies** -(strategies for choosing actions) and how to compute (or approximate) -the **optimal policy** for a given MDP. We’ll introduce the **Bellman -consistency condition**, which allows us to analyze the whole series of -interactions in terms of individual timesteps. +We’ll describe how to _evaluate_ different strategies, called **policies,** and how to compute (or approximate) +the **optimal policy** for a given MDP. +We’ll introduce the **Bellman consistency condition**, which allows us to analyze the whole sequence of interactions in terms of individual timesteps. ## Finite-horizon MDPs +### Definition + ::::{prf:definition} Finite-horizon Markov decision process -:label: finite_mdp +:label: finite_horizon_mdp The components of a finite-horizon Markov decision process are: @@ -93,10 +88,10 @@ The components of a finite-horizon Markov decision process are: 2. 
The **actions** that the agent can take. We use $\mathcal{A}$ to denote the set of possible actions, called the **action space**. -3. Some **initial state distribution** $\mu \in \Delta(\mathcal{S})$. +3. Some **initial state distribution** $\mu \in \triangle(\mathcal{S})$. 4. The **state transitions** (a.k.a. **dynamics**) - $P : \mathcal{S} \times \mathcal{A} \to \Delta(\mathcal{S})$ that describe what state the agent + $P : \mathcal{S} \times \mathcal{A} \to \triangle(\mathcal{S})$ that describe what state the agent transitions to after taking an action. 5. The **reward** signal. In this course we'll take it to be a @@ -104,13 +99,13 @@ The components of a finite-horizon Markov decision process are: $r : \mathcal{S} \times \mathcal{A} \to \mathbb{R}$, but in general many results will extend to a *stochastic* reward signal. -6. A time horizon $H \in \mathbb{N}$ that specifies the number of +6. A time horizon $\hor \in \mathbb{N}$ that specifies the number of interactions in an **episode**. Combined together, these objects specify a finite-horizon Markov decision process: -$$M = (\mathcal{S}, \mathcal{A}, \mu, P, r, H).$$ +$$M = (\mathcal{S}, \mathcal{A}, \mu, P, r, \hor).$$ When there are **finitely** many states and actions, i.e. $|\mathcal{S}|, |\mathcal{A}| < \infty$, we can express @@ -119,23 +114,24 @@ values): $$ \begin{aligned} - r &\in \mathbb{R}^{|\mathcal{S}| \times |\mathcal{A}|} & + \mu &\in [0, 1]^{|\mathcal{S}|} & P &\in [0, 1]^{(|\mathcal{S} \times \mathcal{A}|) \times |\mathcal{S}|} & - \mu &\in [0, 1]^{|\mathcal{S}|} + r &\in \mathbb{R}^{|\mathcal{S}| \times |\mathcal{A}|} \end{aligned} $$ +:::: :::{attention} -Verify that these types make sense! +Verify that the types and shapes provided above make sense! ::: -:::: ```{code-cell} class MDP(NamedTuple): + """A description of a Markov decision process with finitely many states and actions.""" S: int # number of states A: int # number of actions μ: Float[Array, " S"] - P: Float[Array, "S A S"] + P: Float[Array, "S A S"] # "current" state, "current" action, "next" state r: Float[Array, "S A"] H: int γ: float = 1.0 # discount factor (used later) @@ -144,22 +140,21 @@ class MDP(NamedTuple): :::{prf:example} Tidying MDP :label: tidy_mdp -Let's consider an extremely simple decision problem throughout this -chapter: the task of keeping your room tidy! +Let's consider a simple decision problem throughout this chapter: +the task of keeping your room tidy! Your room has the possible states -$\mathcal{S} = \{ \text{orderly}, \text{messy} \}$. You can take either -of the actions $\mathcal{A} = \{ \text{ignore}, \text{tidy} \}$. The room starts -off orderly. +$\mathcal{S} = \{ \text{orderly}, \text{messy} \}.$ +You can take either of the actions $\mathcal{A} = \{ \text{ignore}, \text{tidy} \}.$ +The room starts off orderly. -The **state transitions** are as follows: if you tidy the room, it becomes -(or remains) orderly; if you ignore the room, it _might_ become messy (see table -below). +The **state transitions** are as follows: +if you tidy the room, it becomes (or remains) orderly; +if you ignore the room, it _might_ become messy (see table below). -The **rewards** are as follows: You get penalized for tidying an orderly -room (a waste of time) or ignoring a messy room, but you get rewarded -for ignoring an orderly room (since you can enjoy). Tidying a messy room -is a chore that gives no reward. 
+The **rewards** are as follows: You get penalized for tidying an orderly room (a waste of time) or ignoring a messy room, +but you get rewarded for ignoring an orderly room (since you can enjoy your additional time). +Tidying a messy room is a chore that gives no reward. These are summarized in the following table: @@ -171,7 +166,7 @@ $$\begin{array}{ccccc} \text{messy} & \text{tidy} & 1 & 0 & 0 \\ \end{array}$$ -Consider a time horizon of $H = 7$ days (one interaction per day). Let +Consider a time horizon of $\hor = 7$ days (one interaction per day). Let $t = 0$ correspond to Monday and $t = 6$ correspond to Sunday. ::: @@ -179,8 +174,8 @@ $t = 0$ correspond to Monday and $t = 6$ correspond to Sunday. tidy_mdp = MDP( S=2, # 0 = orderly, 1 = messy A=2, # 0 = ignore, 1 = tidy - μ=np.array([1.0, 0.0]), # start in orderly state - P=np.array( + μ=jnp.array([1.0, 0.0]), # start in orderly state + P=jnp.array( [ [ [0.7, 0.3], # orderly, ignore @@ -192,15 +187,15 @@ tidy_mdp = MDP( ], ] ), - r=np.array( + r=jnp.array( [ [ - 1.0, # orderly, ignore + 1.0, # orderly, ignore -1.0, # orderly, tidy ], [ -1.0, # messy, ignore - 0.0, # messy, tidy + 0.0, # messy, tidy ], ] ), @@ -213,12 +208,12 @@ tidy_mdp = MDP( :::{prf:definition} Policies :label: policy -A **policy** $\pi$ describes the agent's strategy: which actions it -takes in a given situation. A key goal of RL is to find the **optimal -policy** that maximizes the total reward on average. +A **policy** $\pi$ describes the agent's strategy: +which actions it takes in a given situation. +A key goal of RL is to find the **optimal policy** that maximizes the total reward on average. There are three axes along which policies can vary: their outputs, -inputs, and time-dependence. We'll discuss each of these in turn. +inputs, and time-dependence. 1. **Deterministic or stochastic.** A deterministic policy outputs actions while a stochastic policy outputs *distributions* over @@ -230,31 +225,22 @@ inputs, and time-dependence. We'll discuss each of these in turn. actions, and rewards. We'll only consider state-dependent policies in this course. -3. **Stationary or time-dependent.** A stationary policy remains the - same function at all time steps, while a time-dependent policy - $\pi = \{ \pi_0, \dots, \pi_{H-1} \}$ specifies a different function - $\pi_\hi$ at each time step $\hi$. +3. **Stationary or time-dependent.** A stationary (a.k.a. time-homogeneous) policy + remains the same function at all time steps, while a time-dependent policy can depend on the current timestep. + For consistency with states and actions, we will denote the timestep as a subscript, + i.e. $\pi = \{ \pi_0, \dots, \pi_{\hor-1} \}.$ ::: Note that for finite state and action spaces, we can represent a randomized mapping $\mathcal{S} \to \Delta(\mathcal{A})$ -as a matrix $\pi \in [0, 1]^{\mathcal{S}, \mathcal{A}}$ where each row describes +as a matrix $\pi \in [0, 1]^{\mathcal{S} \times \mathcal{A}}$ where each row describes the policy's distribution over actions for the corresponding state. -```{code-cell} -# In code, we use the `Policy` type to represent a randomized mapping from states to actions. -# In the finite-horizon case, an array of `H` of these, one for at each time step, -# would constitute a time-dependent policy. -Policy = Float[Array, "S A"] -``` - -A fascinating result is that every finite-horizon MDP has an optimal -deterministic time-dependent policy! 
Intuitively, the Markov property -implies that the current state contains all the information we need to -make the optimal decision. We'll prove this result constructively later -in the chapter. +A fascinating result is that every finite-horizon MDP has an optimal deterministic time-dependent policy! +Intuitively, the Markov property implies that the current state contains all the information we need to make the optimal decision. +We'll prove this result constructively later in the chapter. -:::{prf:example} Tidying policies +:::{prf:example} Policies for the tidying MDP :label: tidy_policy Here are some possible policies for the tidying MDP {prf:ref}`tidy_mdp`: @@ -270,9 +256,9 @@ Here are some possible policies for the tidying MDP {prf:ref}`tidy_mdp`: ```{code-cell} # arrays of shape (H, S, A) represent time-dependent policies -tidy_policy_always_tidy = np.zeros((7, 2, 2)).at[:, :, 1].set(1.0) -tidy_policy_weekends = np.zeros((7, 2, 2)).at[5:7, :, 1].set(1.0).at[0:5, :, 0].set(1.0) -tidy_policy_messy_only = np.zeros((7, 2, 2)).at[:, 1, 1].set(1.0).at[:, 0, 0].set(1.0) +tidy_policy_always_tidy = jnp.zeros((7, 2, 2)).at[:, :, 1].set(1.0) +tidy_policy_weekends = jnp.zeros((7, 2, 2)).at[5:7, :, 1].set(1.0).at[0:5, :, 0].set(1.0) +tidy_policy_messy_only = jnp.zeros((7, 2, 2)).at[:, 1, 1].set(1.0).at[:, 0, 0].set(1.0) ``` (trajectories)= @@ -285,84 +271,92 @@ A sequence of states, actions, and rewards is called a **trajectory**: $$\tau = (s_0, a_0, r_0, \dots, s_{H-1}, a_{H-1}, r_{H-1})$$ -where -$r_\hi = r(s_\hi, a_\hi)$. (Note that sources differ as to whether to include -the reward at the final time step. This is a minor detail.) +where $r_\hi = r(s_\hi, a_\hi)$. +(Note that some sources omit the reward at the final time step. This is a minor detail.) ::: ```{code-cell} class Transition(NamedTuple): + """A single state-action-reward interaction with the environment.""" s: int a: int r: float +``` +Once we've chosen a policy, +we can sample trajectories by repeatedly choosing actions according to the policy, +transitioning according to the state transitions, and observing the rewards. -Trajectory = list[Transition] -``` +:::{image} shared/trajectory.png +:width: 240px +:align: center +::: -Once we've chosen a policy, we can sample trajectories by repeatedly -choosing actions according to the policy, transitioning according to the -state transitions, and observing the rewards. That is, a policy induces -a distribution $\rho^{\pi}$ over trajectories. (We assume that $\mu$ and -$P$ are clear from context.) +That is, a policy induces a distribution $\rho^{\pi}$ over trajectories. +(We assume that $\mu$ and $P$ are clear from context.) :::{prf:example} Trajectories in the tidying environment :label: tidy_traj Here is a possible trajectory for the tidying example: -| $t$ | $0$ | $1$ | $2$ | $3$ | $4$ | $5$ | $6$ | -|:---:|:-------:|:-------:|:-------:|:------:|:-----:|:-------:|:-------:| -| $s$ | orderly | orderly | orderly | messy | messy | orderly | orderly | -| $a$ | tidy | ignore | ignore | ignore | tidy | ignore | ignore | -| $r$ | $-1$ | $1$ | $1$ | $-1$ | $0$ | $1$ | $1$ | +| $\hi$ | $0$ | $1$ | $2$ | $3$ | $4$ | $5$ | $6$ | +|:-----:|:-------:|:-------:|:-------:|:------:|:-----:|:-------:|:-------:| +| $s$ | orderly | orderly | orderly | messy | messy | orderly | orderly | +| $a$ | tidy | ignore | ignore | ignore | tidy | ignore | ignore | +| $r$ | $-1$ | $1$ | $1$ | $-1$ | $0$ | $1$ | $1$ | Could any of the policies in {prf:ref}`tidy_policy` have generated this trajectory? 
::: -Note that for a state-dependent policy, using the Markov property {prf:ref}`markov`, we can specify this probability distribution in -an **autoregressive** way (i.e. one timestep at a time): +Note that for a state-dependent policy, using the Markov property {prf:ref}`markov`, +we can write down the likelihood function of this probability distribution in an **autoregressive** way (i.e. one timestep at a time): :::{prf:definition} Autoregressive trajectory distribution :label: autoregressive_trajectories -$$\rho^{\pi}(\tau) := \mu(s_0) \pi_0(a_0 \mid s_0) P(s_1 \mid s_0, a_0) \cdots P(s_{H-1} \mid s_{H-2}, a_{H-2}) \pi_{H-1}(a_{H-1} \mid s_{H-1})$$ +$$\rho^{\pi}(\tau) := \mu(s_0) \pi_0(a_0 \mid s_0) P(s_1 \mid s_0, a_0) \cdots P(s_{\hor-1} \mid s_{\hor-2}, a_{\hor-2}) \pi_{\hor-1}(a_{\hor-1} \mid s_{\hor-1})$$ ::: ```{code-cell} -def trajectory_log_likelihood(mdp: MDP, tau: Trajectory, pi: Policy) -> float: - """ - Compute the log likelihood of a trajectory under a given MDP and policy. - """ - total = np.log(mdp.μ[tau[0].s]) - total += np.log(pi[tau[0].s, tau[0].a]) +def trajectory_log_likelihood( + mdp: MDP, + τ: list[Transition], + π: Float[Array, "S A"], +) -> float: + """Compute the log-likelihood of a trajectory under a given MDP and policy.""" + total = jnp.log(mdp.μ[τ[0].s]) + total += jnp.log(π[τ[0].s, τ[0].a]) for i in range(1, mdp.H): - total += np.log(mdp.P[tau[i - 1].s, tau[i - 1].a, tau[i].s]) - total += np.log(pi[tau[i].s, tau[i].a]) + total += jnp.log(mdp.P[τ[i - 1].s, τ[i - 1].a, τ[i].s]) + total += jnp.log(π[τ[i].s, τ[i].a]) return total ``` -:::{tip} +:::{attention} How would you modify this to include stochastic rewards? ::: -For a deterministic policy $\pi$, we have that -$\pi_\hi(a \mid s) = \mathbb{I}[a = \pi_\hi(s)]$; that is, the probability -of taking an action is $1$ if it's the unique action prescribed by the -policy for that state and $0$ otherwise. In this case, the only -randomness in sampling trajectories comes from the initial state -distribution $\mu$ and the state transitions $P$. +For a deterministic policy $\pi$, we have that $\pi_\hi(a \mid s) = \mathbb{I}[a = \pi_\hi(s)]$; +that is, the probability of taking an action is $1$ if it's the unique action prescribed by the policy for that state and $0$ otherwise. +In this case, the only randomness in sampling trajectories comes from the initial state distribution $\mu$ and the state transitions $P$. +++ ### Value functions -The main goal of RL is to find a policy that maximizes the average total -reward $r_0 + \cdots + r_{H-1}$. (Note that this is a random variable -that depends on the policy.) Let's introduce some notation for analyzing -this quantity. +The main goal of RL is to find a policy that maximizes the expected total +reward $\E [r_0 + \cdots + r_{\hor-1}]$. -A policy's **value function** at time $h$ is its expected remaining reward *from a given state*: +:::{attention} +Note that $r_0 + \cdots + r_{\hor-1}$ is a random variable. +What sources of randomness does it depend on? +Describe the generating process. +::: + +Let's introduce some notation for analyzing this quantity. + +A policy's **value function** at time $\hi$ is its expected remaining reward *from a given state*: :::{prf:definition} Value function :label: value @@ -393,7 +387,7 @@ def q_to_v( Compute the value function for a given policy in a known finite MDP at a single timestep from its action-value function. 
""" - return np.sum(policy * q, axis=1) + return jnp.sum(policy * q, axis=1) ``` and the @@ -415,12 +409,13 @@ def v_to_q( return mdp.r + mdp.γ * mdp.P @ v +# convert a list of v functions to a list of q functions v_ary_to_q_ary = vmap(v_to_q, in_axes=(None, 0)) ``` #### Greedy policies -For any given $q \in \mathbb{R}^{|\mathcal{S}| \times |\mathcal{A}|}$, we can define the **greedy policy** $\hat \pi_q$ as the policy that selects the action with the highest $q$-value at each state: +For any given $Q \in \mathbb{R}^{|\mathcal{S}| \times |\mathcal{A}|}$, we can define the **greedy policy** $\hat \pi_Q$ as the policy that selects the action with the highest $Q$-value at each state: ```{code-cell} def q_to_greedy(q: Float[Array, "S A"]) -> Float[Array, "S A"]: @@ -428,7 +423,9 @@ def q_to_greedy(q: Float[Array, "S A"]) -> Float[Array, "S A"]: Get the (deterministic) greedy policy w.r.t. an action-value function. Return the policy as a matrix of shape (S, A) where each row is a one-hot vector. """ - return np.eye(q.shape[1])[np.argmax(q, axis=1)] + A = q.shape[1] + a_ary = jnp.argmax(q, axis=1) + return jnp.eye(A)[a_ary] def v_to_greedy(mdp: MDP, v: Float[Array, " S"]) -> Float[Array, "S A"]: @@ -448,7 +445,9 @@ who is credited with introducing dynamic programming in 1953. :::{prf:theorem} Bellman consistency equation for the value function :label: bellman_consistency -$$V_\hi^\pi(s) = \E_{\substack{a \sim \pi_\hi(s) \\ s' \sim P(s, a)}} [r(s, a) + V_{\hi+1}^\pi(s')]$$ +$$ +V_\hi^\pi(s) = \E_{\substack{a \sim \pi_\hi(s) \\ s' \sim P(s, a)}} [r(s, a) + V_{\hi+1}^\pi(s')] +$$ ::: ```{code-cell} @@ -462,11 +461,11 @@ def check_bellman_consistency_v( satisfies the Bellman consistency equation. """ return all( - np.allclose( + jnp.allclose( # lhs v_ary[h], # rhs - np.sum(policy[h] * (mdp.r + mdp.γ * mdp.P @ v_ary[h + 1]), axis=1), + jnp.sum(policy[h] * (mdp.r + mdp.γ * mdp.P @ v_ary[h + 1]), axis=1), ) for h in range(mdp.H - 1) ) @@ -529,7 +528,7 @@ def bellman_operator_looping( Looping definition of the Bellman operator. Concise version is below """ - v_new = np.zeros(mdp.S) + v_new = jnp.zeros(mdp.S) for s in range(mdp.S): for a in range(mdp.A): for s_next in range(mdp.S): @@ -548,7 +547,7 @@ def bellman_operator( v: Float[Array, " S"], ) -> Float[Array, " S"]: """For a known finite MDP, the Bellman operator can be exactly evaluated.""" - return np.sum(policy * (mdp.r + mdp.γ * mdp.P @ v), axis=1) + return jnp.sum(policy * (mdp.r + mdp.γ * mdp.P @ v), axis=1) return q_to_v(policy, v_to_q(mdp, v)) # equivalent ``` @@ -595,10 +594,10 @@ equation to compute the value function at each time step. 
```{code-cell} def dp_eval_finite(mdp: MDP, policy: Float[Array, "S A"]) -> Float[Array, "H S"]: """Evaluate a policy using dynamic programming.""" - V_ary = [None] * mdp.H + [np.zeros(mdp.S)] # initialize to 0 at end of time horizon + V_ary = [None] * mdp.H + [jnp.zeros(mdp.S)] # initialize to 0 at end of time horizon for h in range(mdp.H - 1, -1, -1): V_ary[h] = bellman_operator(mdp, policy[h], V_ary[h + 1]) - return np.stack(V_ary[:-1]) + return jnp.stack(V_ary[:-1]) ``` This runs in time $O(H \cdot |\mathcal{S}|^2 \cdot |\mathcal{A}|)$ by counting the @@ -811,16 +810,16 @@ $$ def find_optimal_policy(mdp: MDP): Q = [None] * mdp.H pi = [None] * mdp.H - V = [None] * mdp.H + [np.zeros(mdp.S)] # initialize to 0 at end of time horizon + V = [None] * mdp.H + [jnp.zeros(mdp.S)] # initialize to 0 at end of time horizon for h in range(mdp.H - 1, -1, -1): Q[h] = mdp.r + mdp.P @ V[h + 1] - pi[h] = np.eye(mdp.S)[np.argmax(Q[h], axis=1)] # one-hot - V[h] = np.max(Q[h], axis=1) + pi[h] = jnp.eye(mdp.S)[jnp.argmax(Q[h], axis=1)] # one-hot + V[h] = jnp.max(Q[h], axis=1) - Q = np.stack(Q) - pi = np.stack(pi) - V = np.stack(V[:-1]) + Q = jnp.stack(Q) + pi = jnp.stack(pi) + V = jnp.stack(V[:-1]) return pi, V, Q ``` @@ -839,9 +838,9 @@ setting. ```{code-cell} π_opt, V_opt, Q_opt = find_optimal_policy(tidy_mdp) -assert np.allclose(π_opt, tidy_policy_messy_only) -assert np.allclose(V_opt, V_messy) -assert np.allclose(Q_opt[:-1], v_ary_to_q_ary(tidy_mdp, V_messy)[1:]) +assert jnp.allclose(π_opt, tidy_policy_messy_only) +assert jnp.allclose(V_opt, V_messy) +assert jnp.allclose(Q_opt[:-1], v_ary_to_q_ary(tidy_mdp, V_messy)[1:]) "Assertions passed (the 'tidy when messy' policy is optimal)" ``` @@ -892,7 +891,7 @@ The other components of the MDP remain the same: $$M = (\mathcal{S}, \mathcal{A}, \mu, P, r, \gamma).$$ -Code-wise, we can reuse the `MDP` class from before {prf:ref}`finite_mdp` and set `mdp.H = float('inf')`. +Code-wise, we can reuse the `MDP` class from before {prf:ref}`finite_horizon_mdp` and set `mdp.H = float('inf')`. ```{code-cell} tidy_mdp_inf = tidy_mdp._replace(H=float("inf"), γ=0.95) @@ -1101,10 +1100,10 @@ least one nonzero element.) def eval_deterministic_infinite( mdp: MDP, policy: Float[Array, "S A"] ) -> Float[Array, " S"]: - pi = np.argmax(policy, axis=1) # un-one-hot - P_π = mdp.P[np.arange(mdp.S), pi] - r_π = mdp.r[np.arange(mdp.S), pi] - return np.linalg.solve(np.eye(mdp.S) - mdp.γ * P_π, r_π) + pi = jnp.argmax(policy, axis=1) # un-one-hot + P_π = mdp.P[jnp.arange(mdp.S), pi] + r_π = mdp.r[jnp.arange(mdp.S), pi] + return jnp.linalg.solve(jnp.eye(mdp.S) - mdp.γ * P_π, r_π) ``` :::{prf:example} Tidying policy evaluation @@ -1153,7 +1152,7 @@ takes $O(|\mathcal{S}|^2)$ time for the matrix-vector multiplication. 
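For a rough sense of how many iterations this needs on the tidying MDP (a back-of-the-envelope estimate, using that its rewards are bounded by $1$ and $\gamma = 0.95$): the Bellman operator is a $\gamma$-contraction in the supremum norm, so starting from $V = 0$, the error after $t$ iterations is at most $\gamma^t \|V^\pi\|_\infty \le \gamma^t / (1 - \gamma)$. For a tolerance of $\epsilon = 10^{-6}$, this drops below $\epsilon$ once

$$
t \ge \frac{\log\big(1/(\epsilon(1-\gamma))\big)}{\log(1/\gamma)} = \frac{\log(2 \times 10^{7})}{\log(1/0.95)} \approx 330,
$$

so a few hundred iterations, each costing $O(|\mathcal{S}|^2)$ time, suffice. (The code below stops when successive iterates are within $\epsilon$ of each other, which requires the same order of magnitude of iterations.)
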
```{code-cell} def supremum_norm(v): - return np.max(np.abs(v)) # same as np.linalg.norm(v, np.inf) + return jnp.max(jnp.abs(v)) # same as jnp.linalg.norm(v, jnp.inf) def loop_until_convergence(op, v, ε=1e-6): @@ -1167,7 +1166,7 @@ def loop_until_convergence(op, v, ε=1e-6): def iterative_evaluation(mdp: MDP, pi: Float[Array, "S A"], ε=1e-6) -> Float[Array, " S"]: op = partial(bellman_operator, mdp, pi) - return loop_until_convergence(op, np.zeros(mdp.S), ε) + return loop_until_convergence(op, jnp.zeros(mdp.S), ε) ``` Then, as we showed in {eq}`bellman_convergence`, by the Banach fixed-point theorem: @@ -1259,11 +1258,11 @@ gives the **Bellman optimality operator** ```{code-cell} def bellman_optimality_operator(mdp: MDP, v: Float[Array, " S"]) -> Float[Array, " S"]: - return np.max(mdp.r + mdp.γ * mdp.P @ v, axis=1) + return jnp.max(mdp.r + mdp.γ * mdp.P @ v, axis=1) def check_optimal(v: Float[Array, " S"], mdp: MDP): - return np.allclose(v, bellman_optimality_operator(v, mdp)) + return jnp.allclose(v, bellman_optimality_operator(v, mdp)) ``` (value_iteration)= @@ -1278,7 +1277,7 @@ algorithm is known as **value iteration**. def value_iteration(mdp: MDP, ε: float = 1e-6) -> Float[Array, " S"]: """Iterate the Bellman optimality operator until convergence.""" op = partial(bellman_optimality_operator, mdp) - return loop_until_convergence(op, np.zeros(mdp.S), ε) + return loop_until_convergence(op, jnp.zeros(mdp.S), ε) ``` ```{code-cell} @@ -1386,7 +1385,7 @@ def policy_iteration(mdp: MDP, ε=1e-6) -> Float[Array, "S A"]: """Iteratively improve the policy and value function.""" def op(pi): return v_to_greedy(mdp, eval_deterministic_infinite(mdp, pi)) - π_init = np.ones((mdp.S, mdp.A)) / mdp.A # uniform random policy + π_init = jnp.ones((mdp.S, mdp.A)) / mdp.A # uniform random policy return loop_until_convergence(op, π_init, ε) ``` diff --git a/book/pg.md b/book/pg.md index a1c9f82..8ee5c0c 100644 --- a/book/pg.md +++ b/book/pg.md @@ -11,27 +11,48 @@ kernelspec: name: python3 --- -# Policy Gradient Algorithms - -A key task in RL is finding the **optimal policy** in a given environment, -that is, the policy that achieves the most total reward in all states. -Given this task, why not optimize directly over _policies?_ - -Algorithms based on this idea are called _policy optimization algorithms._ -We've already seen some examples of this, +(pg)= +# Policy Optimization + +The core task of RL is finding the **optimal policy** in a given environment. +This is essentially an _optimization problem:_ +out of some space of policies, +we want to find the one that achieves the maximum total reward (in expectation). + +It's typically intractable to compute the optimal policy exactly. +Instead, **policy optimization algorithms** start from some randomly initialized policy, +and then _improve_ it step by step. +We've already seen some examples of these, namely {ref}`policy_iteration` for finite MDPs and {ref}`iterative_lqr` in continuous control. - -**Policy gradient algorithms** form a specific subclass for policies that can be described by a set of **parameters.** -These are responsible for groundbreaking applications including AlphaGo, OpenAI Five, and large language models, +In particular, we often use policies that can be described by some finite set of _parameters._ +For such parameterized policies, +we can approximate the **policy gradient:** +the gradient of the expected total reward with respect to the parameters. 
+This tells us the direction the parameters should be updated to achieve a higher total reward (in expectation). +Policy gradient methods are responsible for groundbreaking applications including AlphaGo, OpenAI Five, and large language models, many of which use policies parameterized as deep neural networks. 1. We begin the chapter with a short review of gradient ascent, -a simple and general **optimization method.** -2. We'll then apply this technique directly to maximize the _\hiotal reward_. +a general **optimization method.** +2. We'll then see how to estimate the **policy gradient,** + enabling us to apply (stochastic) gradient ascent in the RL setting. 3. Then we'll explore some _proximal optimization_ techniques that ensure the steps taken are "not too large". This is helpful to stabilize training and widely used in practice. -+++ +```{code-cell} ipython3 +import numpy as np +import jax +from jaxtyping import Float, Array +from bokeh.plotting import figure, show, output_notebook +from bokeh.models import Arrow, VeeHead, ColumnDataSource, LinearColorMapper, BasicTicker, ColorBar +from bokeh.transform import linear_cmap +from bokeh.layouts import gridplot +from typing import TypeVar, Callable + +Params = TypeVar("Params") + +output_notebook() +``` ## Gradient Ascent @@ -41,7 +62,43 @@ where you keep taking steps in the steepest direction upwards. Here, your vertical position $y$ is the function being optimized, and your horizontal position $(x, z)$ is the input to the function. The _slope_ of the mountain at your current position is given by the _gradient_, -written $\nabla y(x, z) \in \R^2$. +written $\nabla y(x, z) \in \mathbb{R}^2$. + +```{code-cell} ipython3 +def f(x, y): + """Himmelblau's function""" + return (x**2 + y - 11)**2 + (x + y**2 - 7)**2 + +x = np.linspace(-5, 5, 400) +y = np.linspace(-5, 5, 400) +X, Y = np.meshgrid(x, y) +Z = f(X, Y) + +p = figure(width=600, height=600, title="Himmelblau's function") + +mapper = LinearColorMapper(palette="Viridis256", low=Z.min(), high=Z.max()) +p.image(image=[Z], x=-5, y=-5, dw=10, dh=10, color_mapper=mapper) + +color_bar = ColorBar(color_mapper=mapper) +p.add_layout(color_bar, 'right') + +tx, ty = 1., 1. +gx, gy = jax.grad(f, argnums=(0, 1))(tx, ty) + +p.scatter(x=[tx], y=[ty], size=10, color="red") + +p.add_layout(Arrow( + end=VeeHead(size=15), + x_start=tx, + y_start=ty, + x_end=tx + gx.item() * 0.01, + y_end=ty + gy.item() * 0.01, + line_color="blue", +)) + +show(p) +``` + For differentiable functions, this can be thought of as the vector of partial derivatives, $$ @@ -77,7 +134,7 @@ The case of a two-dimensional input is easy to visualize. But this idea can be straightforwardly extended to higher-dimensional inputs. From now on, we'll use $J$ to denote the function we're trying to maximize, -and $\theta$ to denote the parameters being optimized over. +and $\theta$ to denote the parameters being optimized over. (In the above example, $\theta = \begin{pmatrix} x & z \end{pmatrix}^\top$). Notice that our parameters will stop changing once $\nabla J(\theta) = 0.$ Once we reach this **stationary point,** our current parameters are 'locally optimal' in some sense; @@ -85,6 +142,40 @@ it's impossible to increase the function by moving in any direction. If $J$ is _convex_, then the only point where this happens is at the *global optimum.* Otherwise, if $J$ is nonconvex, the best we can hope for is a *local optimum.* +:::{note} +How does a computer compute the gradient of a function? 
+ +One way is _symbolic differentiation,_ +which is similar to the way you might compute it by hand: +the computer applies a list of rules to transform the _symbols_ involved. +Python's `sympy` package supports symbolic differentiation. +However, functions implemented in code may not always have a straightforward symbolic representation. + +Another way is _numerical differentiation,_ +which is based on the limit definition of a (directional) derivative: + +$$ +\nabla_{\boldsymbol{u}} J(\boldsymbol{x}) = \lim_{\varepsilon \to 0} +\frac{J(\boldsymbol{x} + \varepsilon \boldsymbol{u}) - J(\boldsymbol{x})}{\varepsilon} +$$ + +Then, we can substitute a small value of $\varepsilon$ on the r.h.s. to approximate the directional derivative. +How small, though? If we need an accurate estimate, +we may need such a small value of $\varepsilon$ that typical computers will run into rounding errors. +Also, to compute the full gradient, +we would need to compute the r.h.s. once for each input dimension. +This is an issue if computing $J$ is expensive. + +**Automatic differentiation** achieves the best of both worlds. +Like symbolic differentiation, +we manually implement the derivative rules for a few basic operations. +However, instead of executing these on the _symbols_, +we execute them on the _values_ when the function gets called, +like in numerical differentiation. +This allows us to differentiate through programming constructs such as branches or loops, +and doesn't involve any arbitrarily small values. +::: + +++ ### Stochastic gradient ascent @@ -97,13 +188,17 @@ In these cases, we often compute some _estimate_ of the gradient at each step, $ This is called **stochastic** gradient ascent. In the SL example above, we might randomly choose a *minibatch* of samples and use them to estimate the true prediction error. (This approach is known as **_minibatch_ SGD**.) -```python -def sgd_pseudocode( +```{code-cell} ipython3 +def sgd( θ_init: Params, estimate_gradient: Callable[[Params], Params], η: float, n_steps: int, ): + """Perform `n_steps` steps of SGD. + + `estimate_gradient` eats the current parameters and returns an estimate of the objective function's gradient at those parameters. + """ θ = θ_init for step in range(n_steps): θ += η * estimate_gradient(θ) @@ -113,7 +208,9 @@ def sgd_pseudocode( What makes one gradient estimator better than another? Ideally, we want this estimator to be **unbiased;** that is, on average, it matches a single true gradient step: -$$\E [\tilde \nabla J(\theta)] = \nabla J(\theta).$$ +$$ +\E [\tilde \nabla J(\theta)] = \nabla J(\theta). +$$ We also want the _variance_ of the estimator to be low so that its performance doesn't change drastically at each step. @@ -160,13 +257,21 @@ What does $\theta$ correspond to, though? In general, $\pi$ is a function, and optimizing over the space of arbitrary input-output mappings would be intractable. Instead, we need to describe $\pi$ in terms of some finite set of _parameters_ $\theta$. ++++ + (parameterizations)= ### Example policy parameterizations What are some ways we could parameterize our policy? ++++ + +#### Tabular representation + If both the state and action spaces are finite, perhaps we could simply learn a preference value $\theta_{s,a}$ for each state-action pair. 
-Then to turn this into a valid distribution, we perform a "softmax" operation: we exponentiate each of them, and divide by the total: +Then to turn this into a valid distribution, we perform a **softmax** operation: +we exponentiate each of them, +and then normalize to form a valid distribution: $$\pi^\text{softmax}_\theta(a | s) = \frac{\exp(\theta_{s,a})}{\sum_{s,a'} \exp (\theta_{s,a'})}.$$ @@ -211,6 +316,8 @@ More generally, we could map states and actions to unnormalized scores via some The score can then be written as $$\nabla \log \pi_\theta(a|s) = \nabla f_\theta(s, a) - \E_{a \sim \pi_\theta(s)} \nabla f_\theta (s, a')$$ ++++ + ### Continuous action spaces Consider a continuous $n$-dimensional action space $\mathcal{A} = \mathbb{R}^n$. Then for a stochastic policy, we could use a function to predict the *mean* action and then add some random noise about it. For example, we could use a neural network to predict the mean action $\mu_\theta(s)$ and then add some noise $\epsilon \sim \mathcal{N}(0, \sigma^2 I)$ to it: @@ -893,5 +1000,3 @@ TODO - Trust region policy optimization - Natural policy gradient - Proximal policy optimization - - diff --git a/book/planning.md b/book/planning.md new file mode 100644 index 0000000..6b93bda --- /dev/null +++ b/book/planning.md @@ -0,0 +1,15 @@ + + + ++++ + ++++ + ++++ + +(planning)= +# Planning + +## Monte Carlo Tree Search + +(INCOMPLETE) diff --git a/book/shared/npg_line.png b/book/shared/npg_line.png new file mode 100644 index 0000000000000000000000000000000000000000..8203102339ac104c6048009c86b407fcd7916062 GIT binary patch literal 32010 zcma&O1z1+=);0WqfCxx;hk}H(NF#`VfHaDTq=eEb($WZm2q>t4l#~cacXxw?bV#>S zBJt0K=RN;9=l{<4{$AJK?yV2cTI;^&J?9u>j=4hAROBz>P~)IbsEfB1WHnGIbYBz- zZ4Da>{zPt}GZ}spak{1Bq-ksB^wh|~6m`eQ$A7<6~PJ0d8S#elA7} zCnq~cQ63)ae|~`5*1?>I@y|&ke8~knh5L>u6oC=)4_by)`ePKzl06P zG|f-3w!|*D2a=Ius*k%9+@LKz|8a1|((>xi;n1G*w~e9V+@hh;(NR&|Z0!fD7CB@k zS5z<>)%P=S)oHEj15s_81YSSL+Ac6MUHWmNcQ3bCgn&);=c|VJ#Se?_vJPc>n9T5t zUu3p?14RVC1}|bsBY#5D#Hb)EEBh#!5+fuyI9Qp~j}iVhGQ^nh67qRtyA0Ij^CrZ+-kCg-|v-)Xq1ZZ(Ig^;kVlw0nEi8?KzGPPb% z37Bb(xNJ^J>{8l}7G~D(-FYvRSu$1eH23Zw5r-cN3TYZ)v?5073z{prau&RGAzKx9HfxYAp0qt&4tluu)$tj_)W%N6fa z>yCc^icCx-5EB#gI9yI&8ZF}doRfowy7Nx3ug&pFjN&6Rv)TFp+)}$4bg#o@Hjm%) zZ(~KBvsCs}wH~s_hs!!S2_79C4xFjNPcU3J30!FPL#B+uFj$_8I$Yk)yT-m=h^piXB$IEgk9EjUYaK;Jbm%v z1tA^XMU*RCZhIv=cjZydSc$E?AJ)Zhrv`d@VX3K?XvJL#YmR>6lHy=4bjRZ|Ffi<_ z4y&^D+%ErBkIO?Td&QzZ^}-JaSi=-uy!onuqH51S)Dt>tU3}OYFMep9zTApc7+t>Lw;3v$*M+DQj*%7fJ6crWirfbnqH{NnT6b!6o6A>1kE*jSND=RCLESHbD z!o{U^S`ruMRsE-Gr(em=#RUs>bi6<1_Dlt~OcItz>&cU>LoHj_QOm;I^m_vnWsc`< ztADe?dirBsr2cp-l>A`0<|JUQCBkw-)y9Uq*mf#BPQbi=YKlQ$U*FEzIUpi}KrW2( z`FzJ)7B;qYMbGw*j)pAlY+kcY%0in7YW_I9m(lai8ZjV^|T z;c@8}Vjut6mycq-Grh49?y^2=)E><)?7H=FC~>fO^xoL&z~zTus3jd2@IEQW`NH|A zpO~Pd6LV=?`(9uV5wGZ7%#j@!_4;+5VIBIePFZ<*^0QwHOA{5>^*y$4eKx3}&&m<&*YJIY@YM(sq>{Lb$%+f$Q0rj;f81Q3w^x~9A)Izir zR&QivTtTu;yx3E@hQNz`AEnO~+D_5Kqqc}`R{Fv3mg~4NvzbO&(P?REhc&k2rAYsyyBsi@k1}(46Rb@_TXc{BJh+0VY;fEIvNIgWrpZ z+lz_b$)+7|d_9B1!CT^Hfq%X)d*&vs>;@bBHjg_Mn2`l5=$x zuCK4hpb)zyCpVk+RzJ&5Vh{p!du!(A+ISg>w6wHNm78;w0dJARyrRd^c8eNE%u36~ zR853|_c0lgPY-ujEelUaMs$>8dC}tH;^yg9@7%fd-V=rK+gT;1KzU6FWoywyP9da%XBs+tgX z?mYfS$HneJuS9w&udu!pc@_x?Iux(hk*oOW(Jwnk$D4Y3w0Cp$U&6wE5PwG8BXugi zw>gztE#x>__!t`>U(VQ=xjCE;_v6mAh;bwK?ujL*u+4ZW5#&yk+Q#pM)YW~vpFi4= z>wUWOK9Gn-uJ-$Ps{M@#zo@9F=HlU8gL@^;_V&^q9^w!{g`>$%0AM0v>VaNTwXYc4J>LCM7g33t2Q z745>u16YtdZosXm>F69NzZL$rBYaO2&YrH+DGI*y?)>rK57(W-qazRH55h!{Y+QeJ zUFS2bMF~TdXnsj^aJZUl|7Xt{D&#rXlFxP`{g9QvG37x>U|+m=EBkH^V~k>CeEiGz 
zn(2SlEx_fbYOH;=kw8y4_8M1CT?jcfA8I?2cg!XgLyGVIcPu%8Q52Qr|K|x6W~TKR z@0C|n5LX0A4GpUgs{(QNgT>!B3a;F5;#=@q3d&0=UUg3`CRU$ZIDg)}Sriec7+fW6Hw<}4|5K{kn=8D5*Zn(0tOOcg3CMjT1AXU-E`iY z=r@2)4ZT@YV49%jtVo2#*2IHr8ThL^N4241s&mERXsd!ysy ztz~&MTDy$ErT;+3=~E_#Owyo^fpfE#F@3c6 zjYu0?_Tj=t9wm)z|KBhDx@fJ>ptVkZ8UA^vl38p*0vA-X(+|UUX5Sx{&o1{wy)XE) zp{9l=Q_jiB$gjWM?6xENTYcJkyACGup37z8pD~($Hn}oWY_YM0eX5pDfl1n%_NuGS zvp~3rJ_anDZl-2MMFqMCv+|b6`k6oS48<{`-K;4o(A{WhNTsKYY_XrN%*wP)IJUlO zGc6g-5UPuevP#5$@xN{^BJt$O6QUnV0E z6h^}rQ-1x__R2=48WoG_rhIvzHiE8QTMKy8aeVfYcym(|QG_E<77(WmrhMR{u>X0v zY4@5@x;9hN;Rzw5?CD=ya$!ON#`}DY0JY0khW4)}yEFix=KAlXpN?=Hs!Bij)>D{i z#-dW47O-m@V^-99G^>;0=N<@&=NA^%0camjxrqKeN-oT;SmN~v9oMF>Dc-m}1P%>QpBjVo!6HCuG`@_^goa-*w40!O!jzkv z%Q%2SIYSKoaBhnpy?L4t+|jT256_*m zb05a#N07SW%!z*$wIiwJ57tl!0pIybIX@$!t#HYJ@ke!S?MxUG;kHeRCx|R5RCX_d z&*SFpy$Onj1O)(Xz{iGpLzU;k!{F=zIzDkMva+%whBwf>9DVn$7Nhnc8jZI5?cu4k zu;zxSUUy}V^h(6wtvK=tI3Z7$-H6@rOX(AYw&-bSMW zwLAC6PoKJRT(uQCaueBa1SZg7jJfkNzGe+vhgx7VekZ@E=oqka?Io^Z0MLmlyEA*g z{bR$^Tnh|M7qqpt^$*Mw(y;K~nK%J62_h{_M9{3Q7XsFox4Zwhsq`GVQi$mh?~zjT z2na$X1Albq?%isDd@w*g3RXa2jMbK@lrXsyk&&D5pK!q<>?K^ggsQ3t#=;v74S7je zgf8m_TwDNHRWpfh1;At?A|jCdvVPTYmcY}pi<2<=$K&L3dX&L+oKuJf;YU0QF^H5C z^=)!?Z_NwbkpBd%DFQ$z@OG>i0??q=e>bBB`kKy~DujUPm)9E8)j)CNJKPgSK|odg z--!5Fq-=gH&|?1ZrGPl7aGK%X(%Y~{A>B;>iD!KWc0C{>xsX`JjPiBC~#-w5cHSxvqRA}{oxctu@JheSy~GzbdYO0Iz&j*3F^0= z(@cX>7U5x~rOMBtsp~G0|7HxEjxO5a`TWCYUEH%gQWqlnyxOLQnNSmy%FIUw_US#z zwFrZxLDwUFjVp7RW~^r38I)U}_Qt5^Q4)g7NyyVt6c&T$(=p%NBhz0M=iIYD@>=6_ z|03v(1Oft93rDR@5uzK=wmOxlB{I8uH}8+3Uz(c(eQYAQifv}lE%+GsK9__PB3VgL zLU?p6Ai#BOd+LD=4?7Bqf}p9%+k;r-@Zf=D#zm}i#o?94S|GSV*$$%iBkUcA#=I}d ze^J@F72=9BDdFa!8-MAL?EXL~c3|?XaekuLGja7c;jLL&y%3&gThqVCyyzLEgsFQh z&JXosZT|!%fKV2Qu5X&56mV*u?Ynkh3mGiuC!jp2r4~R$m>6>DrW;mt;&tH+(L{^y zP{$`jZB%xb`95p2>NWvw2q% zA0iYMEonNEP9RRoNCYeiU`HU7@z{qwX`jA*Pj+PGJ7o7NZg2lk)=N;C2bYPDa0{~D zyVvxog}gG7Ga6pAKATAB)n}E^vn>C!K_*VwV<|8%z&#(0{N*z%e3*bXsltIcbUia; z3B0gKrVf`D7%4@PbqWn`LaiKwe#xSB>*AK!0XAg4;*x9~_$wR%Gktu;H8ZbCPcGds zZ(xskqSp_N>s?ksTE-W{2ney{tOC>g7cCQJD6l*A8rObtcyeb1i0nQz`jPah6O#By0ahD#o0MJE&>n_{2Ag@ z5j+fm?OCIh&3vuDn3X2 zA(1(!d5dX2&TO4vSNq5UON&_vHJ(9qN!#5iKe;HuwS4+rHzwGIeL$Z-Onmya%R3n6!NxvM6?F%Hi z`=1qNKDX35j5Q#n>Lm;Ky=8Y2bk+H(7Faexs#=r?N`JMC<9XEj+78ndDsbfHZWVk@+l;i<%gV-3kb3O4Dy#9HuwA-`Yt+%l@*QJv$)sBh4%^1?d`Ev<3> z?;mU;zLLUg_lF}Er(|WZ0#K%@aqAyrX_%u$;xtqn}zgnJ(6PQVa5Ci>lN%k=)%DcPKYFhhQxXbSekJfM{a(O zHnsMnyBVMgYcL^vGuCnEih0ZvTjV9pjnTX|%HqOiln6W2lm0IF6t_V&TZ z?q9-js1C`TVPjrufbwsD+6AkYLDl-d%cKr&H#J*FU}4ZxgGT{9SElz#e`=rlV~$L6 z^?%IKebcrtCs=xDCofc0HCr|*yKpP3Y(iPeJ|VkY9k zDOchIhW&UEjSP}rBLf?IVs@Y#Ij3;P`S|#5-0w!L2e9o7&v!0M_0@s?wXtU>8{|w+ z!vGaWa4lRwFax6%7kdUPi72^L0XGONAG}hS^!4@OctFm7q{fJsT4d)NsWP>7-vPWMR)UT>>j1YIRF6`70 zAi{|hIk>Fs8ZsDD!}I}}5OVmP;;TYJ2WQ+zD}Uf9N`7R)JmTV!Knp`;{HEr4n3S@S zG1|!U3OI3C?l%L*lw1?`M>nCnf>j~&!JzG8q<0fB~`fbAeSjgoGIs zl6K);f_yx5SY4Jd3^0W;VBf+l*Yj|~34C}X%n2@198|gj4G%-OgV<=`CX!kbv>Is1 zIXO99#mE2pQ5T)j0n^H&qBUy%AGh}?Q7}T|qq?BFjA`3UI> z5X*{nDk^(G52DFLMuIGi+W!4;hajrgyt`YMNb-@9t4v(aCQ`Ow zVslK0G|E^uw> rapD~d>A(Gx3-|vsjC}XcQ{vn9=e(ABFx>=Sa%#)QZQ8k-*0lct-Yqk^ literal 0 HcmV?d00001 diff --git a/book/shared/references.bib b/book/shared/references.bib index 6656795..8259c44 100644 --- a/book/shared/references.bib +++ b/book/shared/references.bib @@ -1,9 +1,55 @@ -@article{achiam_spinning_2018, - title = {Spinning {{Up}} in {{Deep Reinforcement Learning}}}, - author = {Achiam, Joshua}, +@book{vershynin_high-dimensional_2018, + title = {High-{{Dimensional Probability}}: {{An Introduction}} with {{Applications}} in {{Data Science}}}, + 
shorttitle = {High-{{Dimensional Probability}}}, + author = {Vershynin, Roman}, year = {2018}, - urldate = {2024-07-01}, - file = {/Users/alexandercai/Zotero/storage/UPUMW6XV/index.html} + month = sep, + publisher = {Cambridge University Press}, + abstract = {High-dimensional probability offers insight into the behavior of random vectors, random matrices, random subspaces, and objects used to quantify uncertainty in high dimensions. Drawing on ideas from probability, analysis, and geometry, it lends itself to applications in mathematics, statistics, theoretical computer science, signal processing, optimization, and more. It is the first to integrate theory, key tools, and modern applications of high-dimensional probability. Concentration inequalities form the core, and it covers both classical results such as Hoeffding's and Chernoff's inequalities and modern developments such as the matrix Bernstein's inequality. It then introduces the powerful methods based on stochastic processes, including such tools as Slepian's, Sudakov's, and Dudley's inequalities, as well as generic chaining and bounds based on VC dimension. A broad range of illustrations is embedded throughout, including classical and modern results for covariance estimation, clustering, networks, semidefinite programming, coding, dimension reduction, matrix completion, machine learning, compressed sensing, and sparse regression.}, + googlebooks = {NDdqDwAAQBAJ}, + isbn = {978-1-108-41519-4}, + langid = {english}, + keywords = {Business & Economics / Econometrics,Computers / Optical Data Processing,Language Arts & Disciplines / Library & Information Science / General,Mathematics / Probability & Statistics / General,Technology & Engineering / Signals & Signal Processing}, + file = {/Users/adzcai/Vault/papers/assets/2018/High-Dimensional Probability (2018) - Vershynin.pdf} +} + +@book{kochenderfer_algorithms_2022, + title = {Algorithms for {{Decision Making}}}, + author = {Kochenderfer, Mykel J and Wheeler, Tim A and Wray, Kyle H}, + year = {2022}, + month = aug, + urldate = {2022-10-23}, + abstract = {A broad introduction to algorithms for decision making under uncertainty, introducing the underlying mathematical problem formulations and the algorithms for...}, + isbn = {978-0-262-04701-2}, + langid = {american}, + file = {/Users/adzcai/Vault/papers/assets/2022/Algorithms for Decision Making (2022) - Kochenderfer, Wheeler, Wray.pdf} +} + +@book{sutton_reinforcement_2018, + title = {Reinforcement Learning: An Introduction}, + shorttitle = {Reinforcement Learning}, + author = {Sutton, Richard S. and Barto, Andrew G.}, + year = {2018}, + series = {Adaptive Computation and Machine Learning Series}, + edition = {Second edition}, + publisher = {The MIT Press}, + address = {Cambridge, Massachusetts}, + abstract = {"Reinforcement learning, one of the most active research areas in artificial intelligence, is a computational approach to learning whereby an agent tries to maximize the total amount of reward it receives while interacting with a complex, uncertain environment. 
In Reinforcement Learning, Richard Sutton and Andrew Barto provide a clear and simple account of the field's key ideas and algorithms."--}, + isbn = {978-0-262-03924-6}, + langid = {english}, + lccn = {Q325.6 .R45 2018}, + keywords = {Reinforcement learning}, + file = {/Users/adzcai/Vault/papers/assets/2018/Reinforcement learning (2018) - Sutton, Barto.pdf} +} + +@book{agarwal_reinforcement_2022, + title = {Reinforcement {{Learning}}: {{Theory}} and {{Algorithms}}}, + shorttitle = {{{AJKS}}}, + author = {Agarwal, Alekh and Jiang, Nan and Kakade, Sham M and Sun, Wen}, + year = {2022}, + month = jan, + langid = {english}, + file = {/Users/adzcai/Vault/papers/assets/2022/Reinforcement Learning (2022) - Agarwal, Jiang, Kakade, Sun.pdf} } @misc{adaptive_agent_team_human-timescale_2023, @@ -20,32 +66,15 @@ @misc{adaptive_agent_team_human-timescale_2023 archiveprefix = {arXiv}, keywords = {Computer Science - Artificial Intelligence,Computer Science - Machine Learning,Computer Science - Neural and Evolutionary Computing}, annotation = {1 citations (Semantic Scholar/arXiv) [2023-02-20]}, - file = {/Users/alexandercai/Library/CloudStorage/GoogleDrive-alexcai@college.harvard.edu/My Drive/Vault/papers/assets/2023/Human-Timescale Adaptation in an Open-Ended Task Space (2023) - Adaptive Agent Team et al.pdf} -} - -@book{agarwal_reinforcement_2022, - title = {Reinforcement {{Learning}}: {{Theory}} and {{Algorithms}}}, - shorttitle = {{{AJKS}}}, - author = {Agarwal, Alekh and Jiang, Nan and Kakade, Sham M and Sun, Wen}, - year = {2022}, - month = jan, - langid = {english}, - file = {/Users/alexandercai/Library/CloudStorage/GoogleDrive-alexcai@college.harvard.edu/My Drive/Vault/papers/assets/2022/Reinforcement Learning (2022) - Agarwal, Jiang, Kakade, Sun.pdf} + file = {/Users/adzcai/Vault/papers/assets/2023/Human-Timescale Adaptation in an Open-Ended Task Space (2023) - Adaptive Agent Team et al.pdf} } -@inproceedings{azar_minimax_2017, - title = {Minimax {{Regret Bounds}} for {{Reinforcement Learning}}}, - booktitle = {Proceedings of the 34th {{International Conference}} on {{Machine Learning}}}, - author = {Azar, Mohammad Gheshlaghi and Osband, Ian and Munos, R{\'e}mi}, - year = {2017}, - month = jul, - pages = {263--272}, - publisher = {PMLR}, - issn = {2640-3498}, - urldate = {2024-06-21}, - abstract = {We consider the problem of provably optimal exploration in reinforcement learning for finite horizon MDPs. We show that an optimistic modification to value iteration achieves a regret bound of \${\textbackslash}tilde \{O\}( {\textbackslash}sqrt\{HSAT\} + H{\textasciicircum}2S{\textasciicircum}2A+H{\textbackslash}sqrt\{T\})\$ where \$H\$ is the time horizon, \$S\$ the number of states, \$A\$ the number of actions and \$T\$ the number of time-steps. This result improves over the best previous known bound \${\textbackslash}tilde \{O\}(HS {\textbackslash}sqrt\{AT\})\$ achieved by the UCRL2 algorithm. The key significance of our new results is that when \$T{\textbackslash}geq H{\textasciicircum}3S{\textasciicircum}3A\$ and \$SA{\textbackslash}geq H\$, it leads to a regret of \${\textbackslash}tilde\{O\}({\textbackslash}sqrt\{HSAT\})\$ that matches the established lower bound of \${\textbackslash}Omega({\textbackslash}sqrt\{HSAT\})\$ up to a logarithmic factor. Our analysis contain two key insights. 
We use careful application of concentration inequalities to the optimal value function as a whole, rather than to the transitions probabilities (to improve scaling in \$S\$), and we define Bernstein-based ``exploration bonuses'' that use the empirical variance of the estimated values at the next states (to improve scaling in \$H\$).}, - langid = {english}, - file = {/Users/alexandercai/Library/CloudStorage/GoogleDrive-alexcai@college.harvard.edu/My Drive/Vault/papers/assets/2017/Minimax Regret Bounds for Reinforcement Learning (2017) - Azar, Osband, Munos.pdf} +@book{boyd_convex_2004, + title = {Convex {{Optimization}}}, + author = {Boyd, Stephen and Vandenberghe, Lieven}, + year = {2004}, + publisher = {Cambridge University Press}, + file = {/Users/adzcai/Vault/papers/assets/2004/Convex Optimization (2004) - Boyd, Vandenberghe.pdf;/Users/adzcai/Vault/papers/assets/2004/Slides - Convex Optimization (2004) - Boyd, Vandenberghe.pdf} } @misc{babuschkin_deepmind_2020, @@ -54,21 +83,16 @@ @misc{babuschkin_deepmind_2020 year = {2020} } -@article{barto_neuronlike_1983, - title = {Neuronlike Adaptive Elements That Can Solve Difficult Learning Control Problems}, - author = {Barto, Andrew G. and Sutton, Richard S. and Anderson, Charles W.}, - year = {1983}, - month = sep, - journal = {IEEE Transactions on Systems, Man, and Cybernetics}, - volume = {SMC-13}, - number = {5}, - pages = {834--846}, - issn = {2168-2909}, - doi = {10.1109/TSMC.1983.6313077}, - urldate = {2024-07-01}, - abstract = {It is shown how a system consisting of two neuronlike adaptive elements can solve a difficult learning control problem. The task is to balance a pole that is hinged to a movable cart by applying forces to the cart's base. It is argued that the learning problems faced by adaptive elements that are components of adaptive networks are at least as difficult as this version of the pole-balancing problem. The learning system consists of a single associative search element (ASE) and a single adaptive critic element (ACE). In the course of learning to balance the pole, the ASE constructs associations between input and output by searching under the influence of reinforcement feedback, and the ACE constructs a more informative evaluation function than reinforcement feedback alone can provide. 
The differences between this approach and other attempts to solve problems using neurolike elements are discussed, as is the relation of this work to classical and instrumental conditioning in animal learning studies and its possible implications for research in the neurosciences.}, - keywords = {Adaptive systems,Biological neural networks,Neurons,Pattern recognition,Problem-solving,Supervised learning,Training}, - file = {/Users/alexandercai/Zotero/storage/GHD9WZXL/6313077.html} +@book{sussman_functional_2013, + title = {Functional Differential Geometry}, + author = {Sussman, Gerald Jay and Wisdom, Jack and Farr, Will}, + year = {2013}, + publisher = {The MIT Press}, + address = {Cambridge, MA}, + isbn = {978-0-262-01934-7}, + lccn = {QC20.7.D52 S87 2013}, + keywords = {Functional differential equations,Geometry Differential,Mathematical physics}, + file = {/Users/adzcai/Vault/papers/assets/2013/Functional differential geometry (2013) - Sussman, Wisdom, Farr.pdf} } @article{degrave_magnetic_2022, @@ -89,7 +113,25 @@ @article{degrave_magnetic_2022 langid = {english}, keywords = {Computer science,Magnetically confined plasmas,Nuclear fusion and fission}, annotation = {230 citations (Semantic Scholar/DOI) [2023-05-21]}, - file = {/Users/alexandercai/Library/CloudStorage/GoogleDrive-alexcai@college.harvard.edu/My Drive/Vault/papers/assets/2022/Magnetic control of tokamak plasmas through deep reinforcement learning (2022) - Degrave et al.pdf} + file = {/Users/adzcai/Vault/papers/assets/2022/Magnetic control of tokamak plasmas through deep reinforcement learning (2022) - Degrave et al.pdf} +} + +@misc{hausknecht_deep_2017, + title = {Deep {{Recurrent Q-Learning}} for {{Partially Observable MDPs}}}, + author = {Hausknecht, Matthew and Stone, Peter}, + year = {2017}, + month = jan, + number = {arXiv:1507.06527}, + eprint = {1507.06527}, + primaryclass = {cs}, + publisher = {arXiv}, + doi = {10.48550/arXiv.1507.06527}, + urldate = {2023-06-04}, + abstract = {Deep Reinforcement Learning has yielded proficient controllers for complex tasks. However, these controllers have limited memory and rely on being able to perceive the complete game screen at each decision point. To address these shortcomings, this article investigates the effects of adding recurrency to a Deep Q-Network (DQN) by replacing the first post-convolutional fully-connected layer with a recurrent LSTM. The resulting {\textbackslash}textit\{Deep Recurrent Q-Network\} (DRQN), although capable of seeing only a single frame at each timestep, successfully integrates information through time and replicates DQN's performance on standard Atari games and partially observed equivalents featuring flickering game screens. Additionally, when trained with partial observations and evaluated with incrementally more complete observations, DRQN's performance scales as a function of observability. Conversely, when trained with full observations and evaluated with partial observations, DRQN's performance degrades less than DQN's. 
Thus, given the same length of history, recurrency is a viable alternative to stacking a history of frames in the DQN's input layer and while recurrency confers no systematic advantage when learning to play the game, the recurrent net can better adapt at evaluation time if the quality of observations changes.}, + archiveprefix = {arXiv}, + keywords = {Computer Science - Machine Learning}, + annotation = {1274 citations (Semantic Scholar/arXiv) [2023-06-04]}, + file = {/Users/adzcai/Vault/papers/assets/2017/Deep Recurrent Q-Learning for Partially Observable MDPs (2017) - Hausknecht, Stone.pdf} } @inproceedings{freeman_brax_2021, @@ -107,37 +149,24 @@ @inproceedings{freeman_brax_2021 pubstate = {preprint {\textbar} DBLP: https://dblp.org/rec/conf/nips/FreemanFRGMB21}, keywords = {Computer Science - Artificial Intelligence,Computer Science - Robotics}, annotation = {151 citations (Semantic Scholar/arXiv) [2023-07-22]}, - file = {/Users/alexandercai/Library/CloudStorage/GoogleDrive-alexcai@college.harvard.edu/My Drive/Vault/papers/assets/2021/Brax – A Differentiable Physics Engine for Large Scale Rigid Body Simulation (2021) - Freeman et al.pdf} + file = {/Users/adzcai/Vault/papers/assets/2021/Brax – A Differentiable Physics Engine for Large Scale Rigid Body Simulation (2021) - Freeman et al.pdf} } -@misc{hausknecht_deep_2017, - title = {Deep {{Recurrent Q-Learning}} for {{Partially Observable MDPs}}}, - author = {Hausknecht, Matthew and Stone, Peter}, - year = {2017}, - month = jan, - number = {arXiv:1507.06527}, - eprint = {1507.06527}, +@inproceedings{schulman_high-dimensional_2018, + title = {High-{{Dimensional Continuous Control Using Generalized Advantage Estimation}}}, + booktitle = {{{ICLR}} 2016}, + author = {Schulman, John and Moritz, Philipp and Levine, Sergey and Jordan, Michael and Abbeel, Pieter}, + year = {2018}, + month = oct, + eprint = {1506.02438}, primaryclass = {cs}, - publisher = {arXiv}, - doi = {10.48550/arXiv.1507.06527}, - urldate = {2023-06-04}, - abstract = {Deep Reinforcement Learning has yielded proficient controllers for complex tasks. However, these controllers have limited memory and rely on being able to perceive the complete game screen at each decision point. To address these shortcomings, this article investigates the effects of adding recurrency to a Deep Q-Network (DQN) by replacing the first post-convolutional fully-connected layer with a recurrent LSTM. The resulting {\textbackslash}textit\{Deep Recurrent Q-Network\} (DRQN), although capable of seeing only a single frame at each timestep, successfully integrates information through time and replicates DQN's performance on standard Atari games and partially observed equivalents featuring flickering game screens. Additionally, when trained with partial observations and evaluated with incrementally more complete observations, DRQN's performance scales as a function of observability. Conversely, when trained with full observations and evaluated with partial observations, DRQN's performance degrades less than DQN's. 
Thus, given the same length of history, recurrency is a viable alternative to stacking a history of frames in the DQN's input layer and while recurrency confers no systematic advantage when learning to play the game, the recurrent net can better adapt at evaluation time if the quality of observations changes.}, + urldate = {2023-06-21}, + abstract = {Policy gradient methods are an appealing approach in reinforcement learning because they directly optimize the cumulative reward and can straightforwardly be used with nonlinear function approximators such as neural networks. The two main challenges are the large number of samples typically required, and the difficulty of obtaining stable and steady improvement despite the nonstationarity of the incoming data. We address the first challenge by using value functions to substantially reduce the variance of policy gradient estimates at the cost of some bias, with an exponentially-weighted estimator of the advantage function that is analogous to TD(lambda). We address the second challenge by using trust region optimization procedure for both the policy and the value function, which are represented by neural networks. Our approach yields strong empirical results on highly challenging 3D locomotion tasks, learning running gaits for bipedal and quadrupedal simulated robots, and learning a policy for getting the biped to stand up from starting out lying on the ground. In contrast to a body of prior work that uses hand-crafted policy representations, our neural network policies map directly from raw kinematics to joint torques. Our algorithm is fully model-free, and the amount of simulated experience required for the learning tasks on 3D bipeds corresponds to 1-2 weeks of real time.}, archiveprefix = {arXiv}, - keywords = {Computer Science - Machine Learning}, - annotation = {1274 citations (Semantic Scholar/arXiv) [2023-06-04]}, - file = {/Users/alexandercai/Library/CloudStorage/GoogleDrive-alexcai@college.harvard.edu/My Drive/Vault/papers/assets/2017/Deep Recurrent Q-Learning for Partially Observable MDPs (2017) - Hausknecht, Stone.pdf} -} - -@book{kochenderfer_algorithms_2022, - title = {Algorithms for {{Decision Making}}}, - author = {Kochenderfer, Mykel J and Wheeler, Tim A and Wray, Kyle H}, - year = {2022}, - month = aug, - urldate = {2022-10-23}, - abstract = {A broad introduction to algorithms for decision making under uncertainty, introducing the underlying mathematical problem formulations and the algorithms for...}, - isbn = {978-0-262-04701-2}, - langid = {american}, - file = {/Users/alexandercai/Library/CloudStorage/GoogleDrive-alexcai@college.harvard.edu/My Drive/Vault/papers/assets/2022/Algorithms for Decision Making (2022) - Kochenderfer, Wheeler, Wray.pdf} + pubstate = {preprint}, + keywords = {Computer Science - Machine Learning,Computer Science - Robotics,Electrical Engineering and Systems Science - Systems and Control}, + annotation = {2253 citations (Semantic Scholar/arXiv) [2023-07-22]}, + file = {/Users/adzcai/Vault/papers/assets/2018/High-Dimensional Continuous Control Using Generalized Advantage Estimation (2018) - Schulman et al2.pdf} } @article{lai_asymptotically_1985, @@ -152,7 +181,7 @@ @article{lai_asymptotically_1985 issn = {0196-8858}, doi = {10.1016/0196-8858(85)90002-8}, urldate = {2023-10-23}, - file = {/Users/alexandercai/Library/CloudStorage/GoogleDrive-alexcai@college.harvard.edu/My Drive/Vault/papers/assets/1985/Asymptotically efficient adaptive allocation rules (1985) - Lai, Robbins.pdf} + file = 
{/Users/adzcai/Vault/papers/assets/1985/Asymptotically efficient adaptive allocation rules (1985) - Lai, Robbins.pdf} } @inproceedings{lechner_gigastep_2023, @@ -164,19 +193,7 @@ @inproceedings{lechner_gigastep_2023 urldate = {2023-12-12}, abstract = {Multi-agent reinforcement learning (MARL) research is faced with a trade-off: it either uses complex environments requiring large compute resources, which makes it inaccessible to researchers with limited resources, or relies on simpler dynamics for faster execution, which makes the transferability of the results to more realistic tasks challenging. Motivated by these challenges, we present Gigastep, a fully vectorizable, MARL environment implemented in JAX, capable of executing up to one billion environment steps per second on consumer-grade hardware. Its design allows for comprehensive MARL experimentation, including a complex, high-dimensional space defined by 3D dynamics, stochasticity, and partial observations. Gigastep supports both collaborative and adversarial tasks, continuous and discrete action spaces, and provides RGB image and feature vector observations, allowing the evaluation of a wide range of MARL algorithms. We validate Gigastep's usability through an extensive set of experiments, underscoring its role in widening participation and promoting inclusivity in the MARL research community.}, langid = {english}, - file = {/Users/alexandercai/Library/CloudStorage/GoogleDrive-alexcai@college.harvard.edu/My Drive/Vault/papers/assets/2023/Gigastep - One Billion Steps per Second Multi-agent Reinforcement Learning (2023) - Lechner et al.pdf} -} - -@article{mnih_playing_2013, - title = {Playing {{Atari}} with {{Deep Reinforcement Learning}}}, - author = {Mnih, Volodymyr and Kavukcuoglu, Koray and Silver, David and Graves, Alex and Antonoglou, Ioannis and Wierstra, Daan and Riedmiller, Martin A.}, - year = {2013}, - journal = {CoRR}, - volume = {abs/1312.5602}, - eprint = {1312.5602}, - urldate = {2024-06-21}, - archiveprefix = {arXiv}, - file = {/Users/alexandercai/Library/CloudStorage/GoogleDrive-alexcai@college.harvard.edu/My Drive/Vault/papers/assets/2013/Playing Atari with Deep Reinforcement Learning (2013) - Mnih et al.pdf} + file = {/Users/adzcai/Vault/papers/assets/2023/Gigastep - One Billion Steps per Second Multi-agent Reinforcement Learning (2023) - Lechner et al.pdf} } @book{nielsen_neural_2015, @@ -187,15 +204,31 @@ @book{nielsen_neural_2015 urldate = {2024-03-10} } -@inproceedings{ross_reduction_2010, - title = {A {{Reduction}} of {{Imitation Learning}} and {{Structured Prediction}} to {{No-Regret Online Learning}}}, - booktitle = {International {{Conference}} on {{Artificial Intelligence}} and {{Statistics}}}, - author = {Ross, St{\'e}phane and Gordon, Geoffrey J. and Bagnell, J.}, - year = {2010}, - month = nov, - urldate = {2024-08-08}, - abstract = {Sequential prediction problems such as imitation learning, where future observations depend on previous predictions (actions), violate the common i.i.d. assumptions made in statistical learning. This leads to poor performance in theory and often in practice. Some recent approaches provide stronger guarantees in this setting, but remain somewhat unsatisfactory as they train either non-stationary or stochastic policies and require a large number of iterations. In this paper, we propose a new iterative algorithm, which trains a stationary deterministic policy, that can be seen as a no regret algorithm in an online learning setting. 
We show that any such no regret algorithm, combined with additional reduction assumptions, must find a policy with good performance under the distribution of observations it induces in such sequential settings. We demonstrate that this new approach outperforms previous approaches on two challenging imitation learning problems and a benchmark sequence labeling problem.}, - file = {/Users/alexandercai/Library/CloudStorage/GoogleDrive-alexcai@college.harvard.edu/My Drive/Vault/papers/assets/2010/A Reduction of Imitation Learning and Structured Prediction to No-Regret Online (2010) - Ross, Gordon, Bagnell.pdf} +@inproceedings{azar_minimax_2017, + title = {Minimax {{Regret Bounds}} for {{Reinforcement Learning}}}, + booktitle = {Proceedings of the 34th {{International Conference}} on {{Machine Learning}}}, + author = {Azar, Mohammad Gheshlaghi and Osband, Ian and Munos, R{\'e}mi}, + year = {2017}, + month = jul, + pages = {263--272}, + publisher = {PMLR}, + issn = {2640-3498}, + urldate = {2024-06-21}, + abstract = {We consider the problem of provably optimal exploration in reinforcement learning for finite horizon MDPs. We show that an optimistic modification to value iteration achieves a regret bound of \${\textbackslash}tilde \{O\}( {\textbackslash}sqrt\{HSAT\} + H{\textasciicircum}2S{\textasciicircum}2A+H{\textbackslash}sqrt\{T\})\$ where \$H\$ is the time horizon, \$S\$ the number of states, \$A\$ the number of actions and \$T\$ the number of time-steps. This result improves over the best previous known bound \${\textbackslash}tilde \{O\}(HS {\textbackslash}sqrt\{AT\})\$ achieved by the UCRL2 algorithm. The key significance of our new results is that when \$T{\textbackslash}geq H{\textasciicircum}3S{\textasciicircum}3A\$ and \$SA{\textbackslash}geq H\$, it leads to a regret of \${\textbackslash}tilde\{O\}({\textbackslash}sqrt\{HSAT\})\$ that matches the established lower bound of \${\textbackslash}Omega({\textbackslash}sqrt\{HSAT\})\$ up to a logarithmic factor. Our analysis contain two key insights. We use careful application of concentration inequalities to the optimal value function as a whole, rather than to the transitions probabilities (to improve scaling in \$S\$), and we define Bernstein-based ``exploration bonuses'' that use the empirical variance of the estimated values at the next states (to improve scaling in \$H\$).}, + langid = {english}, + file = {/Users/adzcai/Vault/papers/assets/2017/Minimax Regret Bounds for Reinforcement Learning (2017) - Azar, Osband, Munos.pdf} +} + +@article{mnih_playing_2013-1, + title = {Playing {{Atari}} with {{Deep Reinforcement Learning}}}, + author = {Mnih, Volodymyr and Kavukcuoglu, Koray and Silver, David and Graves, Alex and Antonoglou, Ioannis and Wierstra, Daan and Riedmiller, Martin A.}, + year = {2013}, + journal = {CoRR}, + volume = {abs/1312.5602}, + eprint = {1312.5602}, + urldate = {2024-06-21}, + archiveprefix = {arXiv}, + file = {/Users/adzcai/Vault/papers/assets/2013/Playing Atari with Deep Reinforcement Learning (2013) - Mnih et al.pdf} } @misc{sun_easy--hard_2024, @@ -213,51 +246,7 @@ @misc{sun_easy--hard_2024 abstract = {Current AI alignment methodologies rely on human-provided demonstrations or judgments, and the learned capabilities of AI systems would be upper-bounded by human capabilities as a result. This raises a challenging research question: How can we keep improving the systems when their capabilities have surpassed the levels of humans? 
This paper answers this question in the context of tackling hard reasoning tasks (e.g., level 4-5 MATH problems) via learning from human annotations on easier tasks (e.g., level 1-3 MATH problems), which we term as {\textbackslash}textit\{easy-to-hard generalization\}. Our key insight is that an evaluator (reward model) trained on supervisions for easier tasks can be effectively used for scoring candidate solutions of harder tasks and hence facilitating easy-to-hard generalization over different levels of tasks. Based on this insight, we propose a novel approach to scalable alignment, which firstly trains the process-supervised reward models on easy problems (e.g., level 1-3), and then uses them to evaluate the performance of policy models on hard problems. We show that such {\textbackslash}textit\{easy-to-hard generalization from evaluators\} can enable {\textbackslash}textit\{easy-to-hard generalizations in generators\} either through re-ranking or reinforcement learning (RL). Notably, our process-supervised 7b RL model achieves an accuracy of 34.0{\textbackslash}\% on MATH500, despite only using human supervision on easy problems. Our approach suggests a promising path toward AI systems that advance beyond the frontier of human supervision.}, archiveprefix = {arXiv}, keywords = {Computer Science - Artificial Intelligence,Computer Science - Computation and Language,Computer Science - Machine Learning}, - file = {/Users/alexandercai/Library/CloudStorage/GoogleDrive-alexcai@college.harvard.edu/My Drive/Vault/papers/assets/2024/Easy-to-Hard Generalization (2024) - Sun et al.pdf;/Users/alexandercai/Zotero/storage/J52D59AK/2403.html} -} - -@book{sussman_functional_2013, - title = {Functional Differential Geometry}, - author = {Sussman, Gerald Jay and Wisdom, Jack and Farr, Will}, - year = {2013}, - publisher = {The MIT Press}, - address = {Cambridge, MA}, - isbn = {978-0-262-01934-7}, - lccn = {QC20.7.D52 S87 2013}, - keywords = {Functional differential equations,Geometry Differential,Mathematical physics}, - file = {/Users/alexandercai/Library/CloudStorage/GoogleDrive-alexcai@college.harvard.edu/My Drive/Vault/papers/assets/2013/Functional differential geometry (2013) - Sussman, Wisdom, Farr.pdf} -} - -@book{sutton_reinforcement_2018, - title = {Reinforcement Learning: An Introduction}, - shorttitle = {Reinforcement Learning}, - author = {Sutton, Richard S. and Barto, Andrew G.}, - year = {2018}, - series = {Adaptive Computation and Machine Learning Series}, - edition = {Second edition}, - publisher = {The MIT Press}, - address = {Cambridge, Massachusetts}, - abstract = {"Reinforcement learning, one of the most active research areas in artificial intelligence, is a computational approach to learning whereby an agent tries to maximize the total amount of reward it receives while interacting with a complex, uncertain environment. 
In Reinforcement Learning, Richard Sutton and Andrew Barto provide a clear and simple account of the field's key ideas and algorithms."--}, - isbn = {978-0-262-03924-6}, - langid = {english}, - lccn = {Q325.6 .R45 2018}, - keywords = {Reinforcement learning}, - file = {/Users/alexandercai/Library/CloudStorage/GoogleDrive-alexcai@college.harvard.edu/My Drive/Vault/papers/assets/2018/Reinforcement learning (2018) - Sutton, Barto.pdf} -} - -@book{vershynin_high-dimensional_2018, - title = {High-{{Dimensional Probability}}: {{An Introduction}} with {{Applications}} in {{Data Science}}}, - shorttitle = {High-{{Dimensional Probability}}}, - author = {Vershynin, Roman}, - year = {2018}, - month = sep, - publisher = {Cambridge University Press}, - abstract = {High-dimensional probability offers insight into the behavior of random vectors, random matrices, random subspaces, and objects used to quantify uncertainty in high dimensions. Drawing on ideas from probability, analysis, and geometry, it lends itself to applications in mathematics, statistics, theoretical computer science, signal processing, optimization, and more. It is the first to integrate theory, key tools, and modern applications of high-dimensional probability. Concentration inequalities form the core, and it covers both classical results such as Hoeffding's and Chernoff's inequalities and modern developments such as the matrix Bernstein's inequality. It then introduces the powerful methods based on stochastic processes, including such tools as Slepian's, Sudakov's, and Dudley's inequalities, as well as generic chaining and bounds based on VC dimension. A broad range of illustrations is embedded throughout, including classical and modern results for covariance estimation, clustering, networks, semidefinite programming, coding, dimension reduction, matrix completion, machine learning, compressed sensing, and sparse regression.}, - googlebooks = {NDdqDwAAQBAJ}, - isbn = {978-1-108-41519-4}, - langid = {english}, - keywords = {Business & Economics / Econometrics,Computers / Optical Data Processing,Language Arts & Disciplines / Library & Information Science / General,Mathematics / Probability & Statistics / General,Technology & Engineering / Signals & Signal Processing}, - file = {/Users/alexandercai/Library/CloudStorage/GoogleDrive-alexcai@college.harvard.edu/My Drive/Vault/papers/assets/2018/High-Dimensional Probability (2018) - Vershynin.pdf} + file = {/Users/adzcai/Vault/papers/assets/2024/Easy-to-Hard Generalization (2024) - Sun et al.pdf;/Users/adzcai/Zotero/storage/J52D59AK/2403.html} } @misc{welleck_decoding_2024, @@ -275,7 +264,7 @@ @misc{welleck_decoding_2024 abstract = {One of the most striking findings in modern research on large language models (LLMs) is that scaling up compute during training leads to better results. However, less attention has been given to the benefits of scaling compute during inference. This survey focuses on these inference-time approaches. We explore three areas under a unified mathematical formalism: token-level generation algorithms, meta-generation algorithms, and efficient generation. Token-level generation algorithms, often called decoding algorithms, operate by sampling a single token at a time or constructing a token-level search space and then selecting an output. These methods typically assume access to a language model's logits, next-token distributions, or probability scores. 
Meta-generation algorithms work on partial or full sequences, incorporating domain knowledge, enabling backtracking, and integrating external information. Efficient generation methods aim to reduce token costs and improve the speed of generation. Our survey unifies perspectives from three research communities: traditional natural language processing, modern LLMs, and machine learning systems.}, archiveprefix = {arXiv}, keywords = {Computer Science - Computation and Language,Computer Science - Machine Learning}, - file = {/Users/alexandercai/Library/CloudStorage/GoogleDrive-alexcai@college.harvard.edu/My Drive/Vault/papers/assets/2024/From Decoding to Meta-Generation (2024) - Welleck et al.pdf;/Users/alexandercai/Zotero/storage/S4Y984R4/2406.html} + file = {/Users/adzcai/Vault/papers/assets/2024/From Decoding to Meta-Generation (2024) - Welleck et al.pdf;/Users/adzcai/Zotero/storage/S4Y984R4/2406.html} } @misc{zhai_fine-tuning_2024, @@ -292,7 +281,7 @@ @misc{zhai_fine-tuning_2024 abstract = {Large vision-language models (VLMs) fine-tuned on specialized visual instruction-following data have exhibited impressive language reasoning capabilities across various scenarios. However, this fine-tuning paradigm may not be able to efficiently learn optimal decision-making agents in multi-step goal-directed tasks from interactive environments. To address this challenge, we propose an algorithmic framework that fine-tunes VLMs with reinforcement learning (RL). Specifically, our framework provides a task description and then prompts the VLM to generate chain-of-thought (CoT) reasoning, enabling the VLM to efficiently explore intermediate reasoning steps that lead to the final text-based action. Next, the open-ended text output is parsed into an executable action to interact with the environment to obtain goal-directed task rewards. Finally, our framework uses these task rewards to fine-tune the entire VLM with RL. Empirically, we demonstrate that our proposed framework enhances the decision-making capabilities of VLM agents across various tasks, enabling 7b models to outperform commercial models such as GPT4-V or Gemini. Furthermore, we find that CoT reasoning is a crucial component for performance improvement, as removing the CoT reasoning results in a significant decrease in the overall performance of our method.}, archiveprefix = {arXiv}, keywords = {Computer Science - Artificial Intelligence,Computer Science - Computation and Language,Computer Science - Computer Vision and Pattern Recognition,Computer Science - Machine Learning}, - file = {/Users/alexandercai/Library/CloudStorage/GoogleDrive-alexcai@college.harvard.edu/My Drive/Vault/papers/assets/2024/Fine-Tuning Large Vision-Language Models as Decision-Making Agents via (2024) - Zhai et al.pdf;/Users/alexandercai/Zotero/storage/2X2WJU4D/2405.html} + file = {/Users/adzcai/Vault/papers/assets/2024/Fine-Tuning Large Vision-Language Models as Decision-Making Agents via (2024) - Zhai et al.pdf;/Users/adzcai/Zotero/storage/2X2WJU4D/2405.html} } @misc{zhang_adaptable_2024, @@ -309,7 +298,15 @@ @misc{zhang_adaptable_2024 abstract = {Despite the success of Large Language Models (LLMs) on various tasks following human instructions, controlling model generation at inference time poses a persistent challenge. In this paper, we introduce Ctrl-G, an adaptable framework that facilitates tractable and flexible control of LLM generation to reliably follow logical constraints. 
Ctrl-G combines any production-ready LLM with a Hidden Markov Model, enabling LLM outputs to adhere to logical constraints represented as deterministic finite automata. We show that Ctrl-G, when applied to a TULU2-7B model, outperforms GPT3.5 and GPT4 on the task of interactive text editing: specifically, for the task of generating text insertions/continuations following logical constraints, Ctrl-G achieves over 30\% higher satisfaction rate in human evaluation compared to GPT4. When applied to medium-size language models (e.g., GPT2-large), Ctrl-G also beats its counterparts for constrained generation by large margins on standard benchmarks. Additionally, as a proof-of-concept study, we experiment Ctrl-G on the Grade School Math benchmark to assist LLM reasoning, foreshadowing the application of Ctrl-G, as well as other constrained generation approaches, beyond traditional language generation tasks.}, archiveprefix = {arXiv}, keywords = {Computer Science - Computation and Language}, - file = {/Users/alexandercai/Library/CloudStorage/GoogleDrive-alexcai@college.harvard.edu/My Drive/Vault/papers/assets/2024/Adaptable Logical Control for Large Language Models (2024) - Zhang, Kung, Yoshida, Broeck, Peng.pdf;/Users/alexandercai/Zotero/storage/38W8T74Y/2406.html} + file = {/Users/adzcai/Vault/papers/assets/2024/Adaptable Logical Control for Large Language Models (2024) - Zhang, Kung, Yoshida, Broeck, Peng.pdf;/Users/adzcai/Zotero/storage/38W8T74Y/2406.html} +} + +@article{achiam_spinning_2018, + title = {Spinning {{Up}} in {{Deep Reinforcement Learning}}}, + author = {Achiam, Joshua}, + year = {2018}, + urldate = {2024-07-01}, + file = {/Users/adzcai/Zotero/storage/UPUMW6XV/index.html} } @misc{zhang_deep_2015, @@ -326,5 +323,49 @@ @misc{zhang_deep_2015 abstract = {We study the problem of stochastic optimization for deep learning in the parallel computing environment under communication constraints. A new algorithm is proposed in this setting where the communication and coordination of work among concurrent processes (local workers), is based on an elastic force which links the parameters they compute with a center variable stored by the parameter server (master). The algorithm enables the local workers to perform more exploration, i.e. the algorithm allows the local variables to fluctuate further from the center variable by reducing the amount of communication between local workers and the master. We empirically demonstrate that in the deep learning setting, due to the existence of many local optima, allowing more exploration can lead to the improved performance. We propose synchronous and asynchronous variants of the new algorithm. We provide the stability analysis of the asynchronous variant in the round-robin scheme and compare it with the more common parallelized method ADMM. We show that the stability of EASGD is guaranteed when a simple stability condition is satisfied, which is not the case for ADMM. We additionally propose the momentum-based version of our algorithm that can be applied in both synchronous and asynchronous settings. Asynchronous variant of the algorithm is applied to train convolutional neural networks for image classification on the CIFAR and ImageNet datasets. 
Experiments demonstrate that the new algorithm accelerates the training of deep architectures compared to DOWNPOUR and other common baseline approaches and furthermore is very communication efficient.}, archiveprefix = {arXiv}, keywords = {Computer Science - Machine Learning,Statistics - Machine Learning}, - file = {/Users/alexandercai/Library/CloudStorage/GoogleDrive-alexcai@college.harvard.edu/My Drive/Vault/papers/assets/2015/Deep learning with Elastic Averaging SGD (2015) - Zhang, Choromanska, LeCun.pdf;/Users/alexandercai/Zotero/storage/M4LFKVWK/1412.html} + file = {/Users/adzcai/Vault/papers/assets/2015/Deep learning with Elastic Averaging SGD (2015) - Zhang, Choromanska, LeCun.pdf;/Users/adzcai/Zotero/storage/M4LFKVWK/1412.html} +} + +@article{barto_neuronlike_1983, + title = {Neuronlike Adaptive Elements That Can Solve Difficult Learning Control Problems}, + author = {Barto, Andrew G. and Sutton, Richard S. and Anderson, Charles W.}, + year = {1983}, + month = sep, + journal = {IEEE Transactions on Systems, Man, and Cybernetics}, + volume = {SMC-13}, + number = {5}, + pages = {834--846}, + issn = {2168-2909}, + doi = {10.1109/TSMC.1983.6313077}, + urldate = {2024-07-01}, + abstract = {It is shown how a system consisting of two neuronlike adaptive elements can solve a difficult learning control problem. The task is to balance a pole that is hinged to a movable cart by applying forces to the cart's base. It is argued that the learning problems faced by adaptive elements that are components of adaptive networks are at least as difficult as this version of the pole-balancing problem. The learning system consists of a single associative search element (ASE) and a single adaptive critic element (ACE). In the course of learning to balance the pole, the ASE constructs associations between input and output by searching under the influence of reinforcement feedback, and the ACE constructs a more informative evaluation function than reinforcement feedback alone can provide. The differences between this approach and other attempts to solve problems using neurolike elements are discussed, as is the relation of this work to classical and instrumental conditioning in animal learning studies and its possible implications for research in the neurosciences.}, + keywords = {Adaptive systems,Biological neural networks,Neurons,Pattern recognition,Problem-solving,Supervised learning,Training}, + file = {/Users/adzcai/Zotero/storage/GHD9WZXL/6313077.html} +} + +@inproceedings{ross_reduction_2010, + title = {A {{Reduction}} of {{Imitation Learning}} and {{Structured Prediction}} to {{No-Regret Online Learning}}}, + booktitle = {International {{Conference}} on {{Artificial Intelligence}} and {{Statistics}}}, + author = {Ross, St{\'e}phane and Gordon, Geoffrey J. and Bagnell, J.}, + year = {2010}, + month = nov, + urldate = {2024-08-08}, + abstract = {Sequential prediction problems such as imitation learning, where future observations depend on previous predictions (actions), violate the common i.i.d. assumptions made in statistical learning. This leads to poor performance in theory and often in practice. Some recent approaches provide stronger guarantees in this setting, but remain somewhat unsatisfactory as they train either non-stationary or stochastic policies and require a large number of iterations. In this paper, we propose a new iterative algorithm, which trains a stationary deterministic policy, that can be seen as a no regret algorithm in an online learning setting. 
We show that any such no regret algorithm, combined with additional reduction assumptions, must find a policy with good performance under the distribution of observations it induces in such sequential settings. We demonstrate that this new approach outperforms previous approaches on two challenging imitation learning problems and a benchmark sequence labeling problem.},
+  file = {/Users/adzcai/Vault/papers/assets/2010/A Reduction of Imitation Learning and Structured Prediction to No-Regret Online (2010) - Ross, Gordon, Bagnell.pdf}
+}
+
+@book{heath_scientific_2018,
+  title = {Scientific Computing: An Introductory Survey},
+  shorttitle = {Scientific Computing},
+  author = {Heath, Michael T.},
+  year = {2018},
+  series = {Classics in Applied Mathematics},
+  edition = {Revised second edition, SIAM edition},
+  number = {80},
+  publisher = {{Society for Industrial and Applied Mathematics}},
+  address = {Philadelphia},
+  isbn = {978-1-61197-557-4},
+  lccn = {Q183.9 .H4 2018},
+  keywords = {Data processing,Numerical analysis,Science},
+  file = {/Users/adzcai/Vault/papers/assets/2018/Scientific computing (2018) - Heath - Chapter 1.pdf;/Users/adzcai/Vault/papers/assets/2018/Scientific computing (2018) - Heath - Chapter 2.pdf;/Users/adzcai/Vault/papers/assets/2018/Scientific computing (2018) - Heath - Chapter 3.pdf;/Users/adzcai/Vault/papers/assets/2018/Scientific computing (2018) - Heath - Frontmatter.pdf}
}
diff --git a/book/shared/trajectory.png b/book/shared/trajectory.png
new file mode 100644
index 0000000000000000000000000000000000000000..a4529ea7954d8d267273705d42ace91a2be226af
GIT binary patch
literal 36453
+where $\eta > 0$ is the **learning rate**.
+:::
+
+```{code-cell}
+:tags: [hide-input]
+
+from jaxtyping import Float, Array
+from collections.abc import Callable
+
+# needed by the code cells below
+import jax.numpy as np
+from jax import grad, vmap
+```
+
+```{code-cell}
+Params = Float[Array, " D"]
+
+
+def gradient_descent(
+    loss: Callable[[Params], float],
+    θ_init: Params,
+    η: float,
+    epochs: int,
+):
+    """
+    Run gradient descent to minimize the given loss function
+    (expressed in terms of the parameters).
+    """
+    θ = θ_init
+    for _ in range(epochs):
+        # step against the gradient of the loss, scaled by the learning rate
+        θ = θ - η * grad(loss)(θ)
+    return θ
+```
+
+## Linear regression
+
+In linear regression, we assume that the function $f$ is linear in the parameters:
+
+$$
+\mathcal{F} = \{ x \mapsto \theta^\top x \mid \theta \in \mathbb{R}^D \}
+$$
+
+This function class is extremely simple and only contains linear functions.
+To expand its expressivity, we can _transform_ the input $x$ using some feature function $\phi$,
+i.e. $\widetilde x = \phi(x)$, and then fit a linear model in the transformed space instead.
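+
+For example, a quadratic feature map lets this "linear" model capture quadratic structure in the data.
+The sketch below is purely illustrative (the helper `φ_quadratic` is not used elsewhere):
+
+```{code-cell}
+def φ_quadratic(x: Float[Array, " D"]) -> Float[Array, " K"]:
+    """Illustrative feature map: a constant feature, the raw inputs, and all pairwise products."""
+    return np.concatenate([np.ones(1), x, np.outer(x, x).flatten()])
+```
+
+Passing `φ=φ_quadratic` to the `fit_linear` function defined next then runs ordinary least squares in this lifted feature space.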
+
+```{code-cell}
+def fit_linear(X: Float[Array, "N D"], y: Float[Array, " N"], φ=lambda x: x):
+    """Fit a linear model to the given dataset using ordinary least squares."""
+    X = vmap(φ)(X)  # apply the feature map to each input
+    θ = np.linalg.lstsq(X, y, rcond=None)[0]  # solve the least-squares problem
+    return lambda x: np.dot(φ(x), θ)
+```
+
+## Neural networks
+
+In neural networks, we assume that the function $f$ is a composition of linear functions (represented by matrices $W_i$) and non-linear activation functions (denoted by $\sigma$):
+
+$$
+\mathcal{F} = \{ x \mapsto \sigma(W_L \sigma(W_{L-1} \dots \sigma(W_1 x + b_1) \dots + b_{L-1}) + b_L) \}
+$$
+
+where $W_i \in \mathbb{R}^{D_{i+1} \times D_i}$ and $b_i \in \mathbb{R}^{D_{i+1}}$ are the parameters of the $i$-th layer, and $\sigma$ is the activation function.
+
+This function class is much more expressive and contains many more parameters.
+This makes it more susceptible to overfitting on smaller datasets,
+but also allows it to represent more complex functions.
+In practice, however, neural networks exhibit surprising behavior during training,
+and often generalize well even when the number of parameters is large relative to the dataset.
+
+Another reason for their popularity is the efficient **backpropagation** algorithm for computing the gradient of the empirical risk with respect to the parameters.
+Essentially, the hierarchical structure of the neural network,
+i.e. computing the output of the network as a composition of functions,
+allows us to use the chain rule to compute the gradient of the output with respect to the parameters of each layer.
+
+{cite}`nielsen_neural_2015` provides a comprehensive introduction to neural networks and backpropagation.
diff --git a/environment.yml b/environment.yml
index c8d9ddd..767195a 100644
--- a/environment.yml
+++ b/environment.yml
@@ -16,7 +16,7 @@ dependencies:
   # book
   - jupyter-book
   - jupyterlab
-  - jupytext 1.16.*
+  - jupytext 1.16.2
   - swig
   # github pages
   - ghp-import
diff --git a/graphs.md b/graphs.md
new file mode 100644
index 0000000..0dd9295
--- /dev/null
+++ b/graphs.md
@@ -0,0 +1,23 @@
+# Graphs
+
+Graphs are made with https://mermaid.js.org/syntax/flowchart.html
+
+```bash
+pbpaste | mmdc -i - -o trajectory.png -b transparent
+```
+
+## Trajectories
+
+```mermaid
+graph LR
+    S0($$s_0$$) -- $$\pi_0$$ --> A0{{$$a_0$$}}
+    S0 & A0 --> R0[$$r_0$$]
+    A0 & S0 -- $$P$$ --> S1($$s_1$$)
+    S1 -- $$\pi_1$$ --> A1{{$$a_1$$}}
+    S1 & A1 --> R1[$$r_1$$]
+    A1 & S1 -- $$P$$ --> S2($$s_2$$)
+    S2 -- $$\pi_2$$ --> A2{{$$a_2$$}}
+    S2 & A2 --> R2[$$r_2$$]
+    A2 & S2 -- $$P$$ --> S3($$s_3$$)
+```
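
To make the neural network function class from `book/supervised_learning.md` above concrete, here is a minimal standalone sketch (an illustration only, assuming JAX and a $\tanh$ activation; `mlp_forward` and `example_loss` are hypothetical names): the forward pass composes affine maps with a nonlinearity, and `jax.grad` differentiates through that composition via the chain rule, which is exactly the computation backpropagation performs.

```python
import jax.numpy as jnp
from jax import grad


def mlp_forward(params, x):
    """Compute σ(W_L σ(… σ(W_1 x + b_1) …) + b_L) with σ = tanh."""
    for W, b in params:
        x = jnp.tanh(W @ x + b)
    return x


def example_loss(params, x, y):
    """Squared error of the network's prediction on a single example."""
    return jnp.sum((mlp_forward(params, x) - y) ** 2)


# Reverse-mode differentiation through the composition of layers,
# i.e. the chain-rule computation described in the neural networks section.
params = [(jnp.ones((3, 2)), jnp.zeros(3)), (jnp.ones((1, 3)), jnp.zeros(1))]
grads = grad(example_loss)(params, jnp.ones(2), jnp.zeros(1))
```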